From b95fb6c3d66b7e8b40cfbb4e000e7e023263b2b9 Mon Sep 17 00:00:00 2001 From: yangxinyu Date: Sat, 18 May 2024 11:50:09 +0000 Subject: [PATCH 1/3] change pass order in affine-opt --- compiler/lib/Pipelines/AffineOpt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/lib/Pipelines/AffineOpt.cpp b/compiler/lib/Pipelines/AffineOpt.cpp index e29492ccb..857747836 100644 --- a/compiler/lib/Pipelines/AffineOpt.cpp +++ b/compiler/lib/Pipelines/AffineOpt.cpp @@ -39,8 +39,8 @@ void addGenericAffineOptPasses(OpPassManager &pm) { pm.addNestedPass(createLoopCoalescingPass()); pm.addNestedPass(createLoopFusionPass()); pm.addNestedPass(createSimplifyAffineStructuresPass()); - pm.addPass(memref::createFoldMemRefAliasOpsPass()); pm.addPass(createLowerAffinePass()); + pm.addPass(memref::createFoldMemRefAliasOpsPass()); pm.addPass(arith::createIntRangeOptimizationsPass()); addCleanUpExtPassPipeline(pm); } From 90f102c38934cf7290e884992bce528b128f3fd3 Mon Sep 17 00:00:00 2001 From: yangxinyu Date: Sat, 18 May 2024 11:51:20 +0000 Subject: [PATCH 2/3] fix SetSpaceOptPipeline in gen_testcases.py --- compiler/scripts/gen_testcases.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler/scripts/gen_testcases.py b/compiler/scripts/gen_testcases.py index 18971330a..53a9551c5 100644 --- a/compiler/scripts/gen_testcases.py +++ b/compiler/scripts/gen_testcases.py @@ -138,6 +138,8 @@ def ByreTensorOptPipeline(filecheck, *, entryFunc="main"): def SetSpaceOptPipeline(filecheck, *, entryFunc="main"): return OptPipeline(E2ECollections.SetSpaceOpt, [E2ECollections.ByreOpt], [ "-remove-func-body=\"anchor-attr=__byteir_elementwise_fusion__\"", + "-inline", + "-gpu-launch-func-to-byre", "-set-op-space=\"entry-func={} space=cuda\"".format(entryFunc), "-set-arg-space=\"entry-func={} all-space=cuda\"".format(entryFunc) ], filecheck) From 577af8ebbe71f123cd721d47fea48107773c1ce2 Mon Sep 17 00:00:00 2001 From: yangxinyu Date: Sat, 18 May 2024 11:53:48 +0000 Subject: [PATCH 3/3] add python3 scripts/gen_testcases.py --top-dir=test/E2E --category=E2E result --- .../E2E/MLPInference/10b_ptx_codegen.mlir | 203 +- .../E2E/MLPInference/2_linalg_tensor_opt.mlir | 31 +- .../E2E/MLPInference/3_byre_tensor_opt.mlir | 88 +- .../E2E/MLPInference/4_bufferize_opt.mlir | 90 +- .../test/E2E/MLPInference/5_affine_opt.mlir | 84 +- .../MLPInference/5_alternative_scf_opt.mlir | 84 +- compiler/test/E2E/MLPInference/6_gpu_opt.mlir | 105 +- .../E2E/MLPInference/7_set_space_opt.mlir | 140 +- .../test/E2E/MLPInference/8_byre_opt.mlir | 124 +- .../test/E2E/MLPInference/9a_byre_host.mlir | 129 +- .../E2E/MLPInference/9b_nvvm_codegen.mlir | 129 +- .../test/E2E/MLPInference/device_output.ptx | 226 +- .../test/E2E/MLPInference/host_output.mlir | 34 +- .../test/E2E/ResNet18/BW/10b_ptx_codegen.mlir | 3968 ++++------ .../E2E/ResNet18/BW/2_linalg_tensor_opt.mlir | 599 +- .../E2E/ResNet18/BW/3_byre_tensor_opt.mlir | 1329 ++-- .../test/E2E/ResNet18/BW/4_bufferize_opt.mlir | 954 ++- .../test/E2E/ResNet18/BW/5_affine_opt.mlir | 806 +- .../ResNet18/BW/5_alternative_scf_opt.mlir | 806 +- compiler/test/E2E/ResNet18/BW/6_gpu_opt.mlir | 1648 +---- .../test/E2E/ResNet18/BW/7_set_space_opt.mlir | 2161 ++---- compiler/test/E2E/ResNet18/BW/8_byre_opt.mlir | 1898 +---- .../test/E2E/ResNet18/BW/9a_byre_host.mlir | 1984 ++--- .../test/E2E/ResNet18/BW/9b_nvvm_codegen.mlir | 1984 ++--- .../test/E2E/ResNet18/BW/device_output.ptx | 4016 ++-------- .../test/E2E/ResNet18/BW/host_output.mlir | 216 +- .../test/E2E/ResNet18/FW/10b_ptx_codegen.mlir | 5517 
++++---------- .../E2E/ResNet18/FW/2_linalg_tensor_opt.mlir | 734 +- .../E2E/ResNet18/FW/3_byre_tensor_opt.mlir | 1854 +++-- .../test/E2E/ResNet18/FW/4_bufferize_opt.mlir | 1637 ++-- .../test/E2E/ResNet18/FW/5_affine_opt.mlir | 1445 ++-- .../ResNet18/FW/5_alternative_scf_opt.mlir | 1445 ++-- compiler/test/E2E/ResNet18/FW/6_gpu_opt.mlir | 2391 +----- .../test/E2E/ResNet18/FW/7_set_space_opt.mlir | 3642 ++------- compiler/test/E2E/ResNet18/FW/8_byre_opt.mlir | 2858 ++----- .../test/E2E/ResNet18/FW/9a_byre_host.mlir | 2903 ++------ .../test/E2E/ResNet18/FW/9b_nvvm_codegen.mlir | 2903 ++------ .../test/E2E/ResNet18/FW/device_output.ptx | 6317 +++------------- .../test/E2E/ResNet18/FW/host_output.mlir | 272 +- .../ResNet18/Whole/2_linalg_tensor_opt.mlir | 1172 +-- .../E2E/ResNet18/Whole/3_byre_tensor_opt.mlir | 3681 +++++---- .../E2E/ResNet18/Whole/4_bufferize_opt.mlir | 3369 ++++++--- .../test/E2E/ResNet18/Whole/5_affine_opt.mlir | 2874 +++++--- .../ResNet18/Whole/5_alternative_scf_opt.mlir | 2874 +++++--- .../test/E2E/ResNet18/Whole/6_gpu_opt.mlir | 5041 ++++--------- .../E2E/ResNet18/Whole/7_set_space_opt.mlir | 6567 ++++++----------- .../test/E2E/ResNet18/Whole/8_byre_opt.mlir | 5505 +++++--------- .../test/E2E/ResNet18/Whole/9a_byre_host.mlir | 5691 +++++--------- .../E2E/ResNet18/Whole/9b_nvvm_codegen.mlir | 5698 +++++--------- .../test/E2E/ResNet18/Whole/host_output.mlir | 619 +- 50 files changed, 34889 insertions(+), 65956 deletions(-) diff --git a/compiler/test/E2E/MLPInference/10b_ptx_codegen.mlir b/compiler/test/E2E/MLPInference/10b_ptx_codegen.mlir index 25d987adc..47adb94ce 100644 --- a/compiler/test/E2E/MLPInference/10b_ptx_codegen.mlir +++ b/compiler/test/E2E/MLPInference/10b_ptx_codegen.mlir @@ -4,7 +4,7 @@ module attributes {byre.container_module, gpu.container_module, torch.debug_module_name = "GraphModule"} { gpu.module @unified { - llvm.func @Unknown2(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: !llvm.ptr, %arg13: !llvm.ptr, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown2(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr {llvm.noalias}, %arg6: !llvm.ptr {llvm.noalias}, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: !llvm.ptr {llvm.noalias}, %arg13: !llvm.ptr {llvm.noalias}, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -20,95 +20,46 @@ module attributes {byre.container_module, gpu.container_module, torch.debug_modu %12 = llvm.mlir.constant(0 : index) : i64 %13 = llvm.mlir.constant(20 : index) : i64 %14 = llvm.mlir.constant(10 : index) : i64 - %15 = llvm.mlir.constant(-1 : index) : i64 - %16 = nvvm.read.ptx.sreg.ctaid.x : i32 - %17 = llvm.sext %16 : i32 to i64 - %18 = nvvm.read.ptx.sreg.ntid.x : i32 - %19 = llvm.sext %18 : i32 to i64 - %20 = nvvm.read.ptx.sreg.tid.x : i32 - %21 = llvm.sext %20 : i32 to i64 - %22 = llvm.mul %19, %17 : i64 - %23 = llvm.add %21, %22 : i64 - %24 = llvm.icmp "slt" %23, %13 : i64 - llvm.cond_br %24, ^bb1, ^bb2 - 
^bb1: // pred: ^bb0 - %25 = llvm.srem %23, %14 : i64 - %26 = llvm.icmp "slt" %25, %12 : i64 - %27 = llvm.add %25, %14 : i64 - %28 = llvm.select %26, %27, %25 : i1, i64 - %29 = llvm.icmp "slt" %23, %12 : i64 - %30 = llvm.sub %15, %23 : i64 - %31 = llvm.select %29, %30, %23 : i1, i64 - %32 = llvm.sdiv %31, %14 : i64 - %33 = llvm.sub %15, %32 : i64 - %34 = llvm.select %29, %33, %32 : i1, i64 - %35 = llvm.mul %34, %14 : i64 - %36 = llvm.add %35, %28 : i64 - %37 = llvm.getelementptr %arg6[%36] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %38 = llvm.load %37 : !llvm.ptr -> f32 - %39 = llvm.getelementptr %arg1[%28] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %40 = llvm.load %39 : !llvm.ptr -> f32 - %41 = llvm.fadd %38, %40 : f32 - %42 = llvm.getelementptr %arg13[%36] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %41, %42 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown1(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: !llvm.ptr, %arg13: !llvm.ptr, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %4 = llvm.insertvalue %arg5, %3[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %5 = llvm.insertvalue %arg6, %4[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %6 = llvm.insertvalue %arg7, %5[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %7 = llvm.insertvalue %arg8, %6[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %8 = llvm.insertvalue %arg12, %3[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %9 = llvm.insertvalue %arg13, %8[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %10 = llvm.insertvalue %arg14, %9[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %11 = llvm.insertvalue %arg15, %10[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %12 = llvm.mlir.constant(0.000000e+00 : f32) : f32 - %13 = llvm.mlir.constant(0 : index) : i64 - %14 = llvm.mlir.constant(40 : index) : i64 - %15 = llvm.mlir.constant(20 : index) : i64 - %16 = llvm.mlir.constant(-1 : index) : i64 - %17 = nvvm.read.ptx.sreg.ctaid.x : i32 + %15 = nvvm.read.ptx.sreg.ctaid.x : i32 + %16 = llvm.sext %15 : i32 to i64 + %17 = nvvm.read.ptx.sreg.ntid.x : i32 %18 = llvm.sext %17 : i32 to i64 - %19 = nvvm.read.ptx.sreg.ntid.x : i32 + %19 = nvvm.read.ptx.sreg.tid.x : i32 %20 = llvm.sext %19 : i32 to i64 - %21 = nvvm.read.ptx.sreg.tid.x : i32 - %22 = llvm.sext %21 : i32 to i64 - %23 = llvm.mul %20, %18 : i64 - %24 = llvm.add %22, %23 : i64 - %25 = llvm.icmp "slt" %24, %14 : i64 - llvm.cond_br %25, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %26 = llvm.srem %24, %15 : i64 + %21 = llvm.mul %18, %16 : i64 + %22 = llvm.add %20, %21 : i64 + %23 = nvvm.read.ptx.sreg.nctaid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %18, %24 : i64 + llvm.br ^bb1(%22 : i64) + ^bb1(%26: i64): // 2 preds: ^bb0, ^bb2 %27 = llvm.icmp "slt" %26, %13 : i64 - %28 = llvm.add %26, %15 : i64 - %29 = 
llvm.select %27, %28, %26 : i1, i64 - %30 = llvm.icmp "slt" %24, %13 : i64 - %31 = llvm.sub %16, %24 : i64 - %32 = llvm.select %30, %31, %24 : i1, i64 - %33 = llvm.sdiv %32, %15 : i64 - %34 = llvm.sub %16, %33 : i64 - %35 = llvm.select %30, %34, %33 : i1, i64 - %36 = llvm.mul %35, %15 : i64 - %37 = llvm.add %36, %29 : i64 - %38 = llvm.getelementptr %arg6[%37] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.cond_br %27, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %28 = llvm.srem %26, %14 : i64 + %29 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %30 = llvm.mlir.constant(1 : index) : i64 + %31 = llvm.getelementptr %arg1[%28] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %32 = llvm.load %31 : !llvm.ptr -> f32 + %33 = llvm.insertvalue %26, %5[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %34 = llvm.insertvalue %30, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %35 = llvm.getelementptr %arg6[%26] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %36 = llvm.mul %12, %14 : i64 + %37 = llvm.add %36, %12 : i64 + %38 = llvm.getelementptr %35[%37] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %39 = llvm.load %38 : !llvm.ptr -> f32 - %40 = llvm.getelementptr %arg1[%29] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %41 = llvm.load %40 : !llvm.ptr -> f32 - %42 = llvm.fadd %39, %41 : f32 - %43 = llvm.intr.maxnum(%42, %12) : (f32, f32) -> f32 - %44 = llvm.getelementptr %arg13[%37] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %43, %44 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %40 = llvm.fadd %39, %32 : f32 + %41 = llvm.insertvalue %26, %9[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %42 = llvm.insertvalue %30, %41[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %43 = llvm.getelementptr %arg13[%26] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.getelementptr %43[%37] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %40, %44 : f32, !llvm.ptr + %45 = llvm.add %26, %25 : i64 + llvm.br ^bb1(%45 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: !llvm.ptr, %arg13: !llvm.ptr, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown0(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr {llvm.noalias}, %arg6: !llvm.ptr {llvm.noalias}, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: !llvm.ptr {llvm.noalias}, %arg13: !llvm.ptr {llvm.noalias}, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -121,51 +72,55 @@ module attributes {byre.container_module, gpu.container_module, torch.debug_modu %9 = llvm.insertvalue %arg13, %8[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> %10 = llvm.insertvalue %arg14, %9[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> %11 = llvm.insertvalue %arg15, %10[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %12 = llvm.mlir.constant(0.000000e+00 : f32) : f32 - %13 = 
llvm.mlir.constant(0 : index) : i64 - %14 = llvm.mlir.constant(40 : index) : i64 + %12 = llvm.mlir.constant(0 : index) : i64 + %13 = llvm.mlir.constant(40 : index) : i64 + %14 = llvm.mlir.constant(0.000000e+00 : f32) : f32 %15 = llvm.mlir.constant(20 : index) : i64 - %16 = llvm.mlir.constant(-1 : index) : i64 - %17 = nvvm.read.ptx.sreg.ctaid.x : i32 - %18 = llvm.sext %17 : i32 to i64 - %19 = nvvm.read.ptx.sreg.ntid.x : i32 - %20 = llvm.sext %19 : i32 to i64 - %21 = nvvm.read.ptx.sreg.tid.x : i32 - %22 = llvm.sext %21 : i32 to i64 - %23 = llvm.mul %20, %18 : i64 - %24 = llvm.add %22, %23 : i64 - %25 = llvm.icmp "slt" %24, %14 : i64 - llvm.cond_br %25, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %26 = llvm.srem %24, %15 : i64 - %27 = llvm.icmp "slt" %26, %13 : i64 - %28 = llvm.add %26, %15 : i64 - %29 = llvm.select %27, %28, %26 : i1, i64 - %30 = llvm.icmp "slt" %24, %13 : i64 - %31 = llvm.sub %16, %24 : i64 - %32 = llvm.select %30, %31, %24 : i1, i64 - %33 = llvm.sdiv %32, %15 : i64 - %34 = llvm.sub %16, %33 : i64 - %35 = llvm.select %30, %34, %33 : i1, i64 - %36 = llvm.mul %35, %15 : i64 - %37 = llvm.add %36, %29 : i64 - %38 = llvm.getelementptr %arg6[%37] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %39 = llvm.load %38 : !llvm.ptr -> f32 - %40 = llvm.getelementptr %arg1[%29] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %41 = llvm.load %40 : !llvm.ptr -> f32 - %42 = llvm.fadd %39, %41 : f32 - %43 = llvm.intr.maxnum(%42, %12) : (f32, f32) -> f32 - %44 = llvm.getelementptr %arg13[%37] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %43, %44 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %16 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.sext %16 : i32 to i64 + %18 = nvvm.read.ptx.sreg.ntid.x : i32 + %19 = llvm.sext %18 : i32 to i64 + %20 = nvvm.read.ptx.sreg.tid.x : i32 + %21 = llvm.sext %20 : i32 to i64 + %22 = llvm.mul %19, %17 : i64 + %23 = llvm.add %21, %22 : i64 + %24 = nvvm.read.ptx.sreg.nctaid.x : i32 + %25 = llvm.sext %24 : i32 to i64 + %26 = llvm.mul %19, %25 : i64 + llvm.br ^bb1(%23 : i64) + ^bb1(%27: i64): // 2 preds: ^bb0, ^bb2 + %28 = llvm.icmp "slt" %27, %13 : i64 + llvm.cond_br %28, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %29 = llvm.srem %27, %15 : i64 + %30 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %31 = llvm.mlir.constant(1 : index) : i64 + %32 = llvm.getelementptr %arg1[%29] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %33 = llvm.load %32 : !llvm.ptr -> f32 + %34 = llvm.insertvalue %27, %5[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %35 = llvm.insertvalue %31, %34[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %36 = llvm.getelementptr %arg6[%27] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %37 = llvm.mul %12, %15 : i64 + %38 = llvm.add %37, %12 : i64 + %39 = llvm.getelementptr %36[%38] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %40 = llvm.load %39 : !llvm.ptr -> f32 + %41 = llvm.fadd %40, %33 : f32 + %42 = llvm.intr.maximum(%41, %14) : (f32, f32) -> f32 + %43 = llvm.insertvalue %27, %9[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %44 = llvm.insertvalue %31, %43[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %45 = llvm.getelementptr %arg13[%27] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %46 = llvm.getelementptr %45[%38] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %42, %46 : f32, !llvm.ptr + %47 = llvm.add %27, %26 : i64 + llvm.br ^bb1(%47 : i64) + ^bb3: // pred: ^bb1 llvm.return } } - memref.global "private" constant @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> = 
dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> - memref.global "private" constant @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> = dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0D
E876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> - memref.global "private" constant @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> = dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> memref.global "private" constant @__constant_10xf32_cuda : memref<10xf32, "cuda"> = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> + memref.global "private" constant @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> = 
dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> memref.global "private" constant @__constant_20xf32_0_cuda : memref<20xf32, "cuda"> = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> + memref.global "private" constant @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> = 
dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> memref.global "private" constant @__constant_20xf32_cuda : memref<20xf32, "cuda"> = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 
0.121171109, -0.155078679]> + memref.global "private" constant @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/2_linalg_tensor_opt.mlir b/compiler/test/E2E/MLPInference/2_linalg_tensor_opt.mlir index df028cb88..473a2a1a6 100644 --- a/compiler/test/E2E/MLPInference/2_linalg_tensor_opt.mlir +++ b/compiler/test/E2E/MLPInference/2_linalg_tensor_opt.mlir @@ -10,31 +10,24 @@ module attributes {torch.debug_module_name = "GraphModule"} { %3 = mhlo.maximum %2, %0 : tensor<2x20xf32> return %3 : tensor<2x20xf32> } - func.func private @Unknown1(%arg0: tensor<20xf32>, %arg1: tensor<2x20xf32>) -> tensor<2x20xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<2x20xf32> - %1 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<20xf32>) -> tensor<2x20xf32> - %2 = mhlo.add %arg1, %1 : tensor<2x20xf32> - %3 = mhlo.maximum %2, %0 : tensor<2x20xf32> - return %3 : tensor<2x20xf32> - } func.func private @Unknown2(%arg0: tensor<10xf32>, %arg1: tensor<2x10xf32>) -> tensor<2x10xf32> attributes {__byteir_elementwise_fusion__} { %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<10xf32>) -> tensor<2x10xf32> %1 = mhlo.add %arg1, %0 : tensor<2x10xf32> return %1 : tensor<2x10xf32> } func.func @forward(%arg0: tensor<2x10xf32>) -> tensor<2x10xf32> { - %0 = mhlo.constant dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32> - %1 = mhlo.constant dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 
0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32> - %2 = mhlo.constant dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32> - %3 = mhlo.constant dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> : tensor<20x10xf32> - %4 = mhlo.constant 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> : tensor<20x20xf32> - %5 = mhlo.constant 
dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> : tensor<10x20xf32> - %6 = "mhlo.dot"(%arg0, %5) : (tensor<2x10xf32>, tensor<10x20xf32>) -> tensor<2x20xf32> - %7 = call @Unknown0(%0, %6) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> - %8 = "mhlo.dot"(%7, %4) : (tensor<2x20xf32>, tensor<20x20xf32>) -> tensor<2x20xf32> - %9 = call @Unknown1(%1, %8) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> - %10 = "mhlo.dot"(%9, %3) : (tensor<2x20xf32>, tensor<20x10xf32>) -> tensor<2x10xf32> - %11 = call @Unknown2(%2, %10) : (tensor<10xf32>, tensor<2x10xf32>) -> tensor<2x10xf32> + %0 = mhlo.constant 
dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> : tensor<20x10xf32> + %1 = mhlo.constant dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32> + %2 = mhlo.constant 
dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> : tensor<20x20xf32> + %3 = mhlo.constant dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : 
tensor<20xf32> + %4 = mhlo.constant dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> : tensor<10x20xf32> + %5 = mhlo.constant dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32> + %6 = "mhlo.dot_general"(%arg0, %0) {dot_dimension_numbers = #mhlo.dot} : (tensor<2x10xf32>, tensor<20x10xf32>) -> tensor<2x20xf32> + %7 = call @Unknown0(%1, %6) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> + %8 = "mhlo.dot_general"(%7, %2) {dot_dimension_numbers = #mhlo.dot} : (tensor<2x20xf32>, tensor<20x20xf32>) -> tensor<2x20xf32> + %9 = call @Unknown0(%3, %8) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> + %10 = "mhlo.dot_general"(%9, %4) {dot_dimension_numbers = #mhlo.dot} : (tensor<2x20xf32>, tensor<10x20xf32>) -> tensor<2x10xf32> + %11 = call @Unknown2(%5, %10) : (tensor<10xf32>, tensor<2x10xf32>) -> tensor<2x10xf32> return %11 : tensor<2x10xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/3_byre_tensor_opt.mlir b/compiler/test/E2E/MLPInference/3_byre_tensor_opt.mlir index 0c7e3d79e..a4a2b2bdb 100644 --- a/compiler/test/E2E/MLPInference/3_byre_tensor_opt.mlir +++ b/compiler/test/E2E/MLPInference/3_byre_tensor_opt.mlir @@ -2,53 +2,69 @@ // CHECK-LABEL: func.func @forward -#map = affine_map<(d0, d1) -> (d0, d1)> -#map1 = affine_map<(d0, d1) -> (d1)> +#map = affine_map<() -> ()> module attributes {torch.debug_module_name = "GraphModule"} { func.func private @Unknown0(%arg0: tensor<20xf32>, %arg1: tensor<2x20xf32>) -> tensor<2x20xf32> attributes {__byteir_elementwise_fusion__} { + %c20 = arith.constant 20 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<2x20xf32> - %1 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : 
tensor<2x20xf32>, tensor<20xf32>) outs(%0 : tensor<2x20xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %2 = arith.addf %in, %in_0 : f32 - %3 = arith.maxnumf %2, %cst : f32 - linalg.yield %3 : f32 - } -> tensor<2x20xf32> - return %1 : tensor<2x20xf32> - } - func.func private @Unknown1(%arg0: tensor<20xf32>, %arg1: tensor<2x20xf32>) -> tensor<2x20xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.empty() : tensor<2x20xf32> - %1 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : tensor<2x20xf32>, tensor<20xf32>) outs(%0 : tensor<2x20xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %2 = arith.addf %in, %in_0 : f32 - %3 = arith.maxnumf %2, %cst : f32 - linalg.yield %3 : f32 - } -> tensor<2x20xf32> + %1 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %0) -> (tensor<2x20xf32>) { + %2 = scf.for %arg4 = %c0 to %c20 step %c1 iter_args(%arg5 = %arg3) -> (tensor<2x20xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg4] [1] [1] : tensor<20xf32> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4] [1, 1] [1, 1] : tensor<2x20xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %5 = arith.addf %in_1, %in : f32 + %6 = arith.maximumf %5, %cst : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [1, 1] [1, 1] : tensor into tensor<2x20xf32> + scf.yield %inserted_slice : tensor<2x20xf32> + } + scf.yield %2 : tensor<2x20xf32> + } return %1 : tensor<2x20xf32> } func.func private @Unknown2(%arg0: tensor<10xf32>, %arg1: tensor<2x10xf32>) -> tensor<2x10xf32> attributes {__byteir_elementwise_fusion__} { + %c10 = arith.constant 10 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<2x10xf32> - %1 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : tensor<2x10xf32>, tensor<10xf32>) outs(%0 : tensor<2x10xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %2 = arith.addf %in, %in_0 : f32 - linalg.yield %2 : f32 - } -> tensor<2x10xf32> + %1 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %0) -> (tensor<2x10xf32>) { + %2 = scf.for %arg4 = %c0 to %c10 step %c1 iter_args(%arg5 = %arg3) -> (tensor<2x10xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg4] [1] [1] : tensor<10xf32> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4] [1, 1] [1, 1] : tensor<2x10xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %5 = arith.addf %in_1, %in : f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [1, 1] [1, 1] : tensor into tensor<2x10xf32> + scf.yield %inserted_slice : tensor<2x10xf32> + } + scf.yield %2 : tensor<2x10xf32> + } return %1 : tensor<2x10xf32> } func.func @forward(%arg0: tensor<2x10xf32>) -> tensor<2x10xf32> { - %0 = mhlo.constant dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 
0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32> - %1 = mhlo.constant dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32> - %2 = mhlo.constant dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32> - %3 = mhlo.constant dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> : tensor<20x10xf32> - %4 = mhlo.constant 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> : tensor<20x20xf32> - %5 = mhlo.constant 
dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> : tensor<10x20xf32> - %6 = "mhlo.dot"(%arg0, %5) : (tensor<2x10xf32>, tensor<10x20xf32>) -> tensor<2x20xf32> - %7 = call @Unknown0(%0, %6) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> - %8 = "mhlo.dot"(%7, %4) : (tensor<2x20xf32>, tensor<20x20xf32>) -> tensor<2x20xf32> - %9 = call @Unknown1(%1, %8) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> - %10 = "mhlo.dot"(%9, %3) : (tensor<2x20xf32>, tensor<20x10xf32>) -> tensor<2x10xf32> - %11 = call @Unknown2(%2, %10) : (tensor<10xf32>, tensor<2x10xf32>) -> tensor<2x10xf32> + %0 = mhlo.constant 
dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> : tensor<20x10xf32> + %1 = mhlo.constant dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32> + %2 = mhlo.constant 
dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> : tensor<20x20xf32> + %3 = mhlo.constant dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : 
tensor<20xf32> + %4 = mhlo.constant dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> : tensor<10x20xf32> + %5 = mhlo.constant dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32> + %6 = "mhlo.dot_general"(%arg0, %0) {dot_dimension_numbers = #mhlo.dot} : (tensor<2x10xf32>, tensor<20x10xf32>) -> tensor<2x20xf32> + %7 = call @Unknown0(%1, %6) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> + %8 = "mhlo.dot_general"(%7, %2) {dot_dimension_numbers = #mhlo.dot} : (tensor<2x20xf32>, tensor<20x20xf32>) -> tensor<2x20xf32> + %9 = call @Unknown0(%3, %8) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> + %10 = "mhlo.dot_general"(%9, %4) {dot_dimension_numbers = #mhlo.dot} : (tensor<2x20xf32>, tensor<10x20xf32>) -> tensor<2x10xf32> + %11 = call @Unknown2(%5, %10) : (tensor<10xf32>, tensor<2x10xf32>) -> tensor<2x10xf32> return %11 : tensor<2x10xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/4_bufferize_opt.mlir b/compiler/test/E2E/MLPInference/4_bufferize_opt.mlir index e204402b4..d9816f589 100644 --- a/compiler/test/E2E/MLPInference/4_bufferize_opt.mlir +++ b/compiler/test/E2E/MLPInference/4_bufferize_opt.mlir @@ -2,56 +2,72 @@ // CHECK-LABEL: func.func @forward -#map = affine_map<(d0, d1) -> (d0, d1)> -#map1 = affine_map<(d0, d1) -> (d1)> +#map = affine_map<() -> ()> module attributes {torch.debug_module_name = "GraphModule"} { func.func private @Unknown0(%arg0: tensor<20xf32>, %arg1: tensor<2x20xf32>) -> tensor<2x20xf32> attributes {__byteir_elementwise_fusion__} { + %c20 = arith.constant 20 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<2x20xf32> - %1 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : 
tensor<2x20xf32>, tensor<20xf32>) outs(%0 : tensor<2x20xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %2 = arith.addf %in, %in_0 : f32 - %3 = arith.maxnumf %2, %cst : f32 - linalg.yield %3 : f32 - } -> tensor<2x20xf32> - return %1 : tensor<2x20xf32> - } - func.func private @Unknown1(%arg0: tensor<20xf32>, %arg1: tensor<2x20xf32>) -> tensor<2x20xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.empty() : tensor<2x20xf32> - %1 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : tensor<2x20xf32>, tensor<20xf32>) outs(%0 : tensor<2x20xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %2 = arith.addf %in, %in_0 : f32 - %3 = arith.maxnumf %2, %cst : f32 - linalg.yield %3 : f32 - } -> tensor<2x20xf32> + %1 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %0) -> (tensor<2x20xf32>) { + %2 = scf.for %arg4 = %c0 to %c20 step %c1 iter_args(%arg5 = %arg3) -> (tensor<2x20xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg4] [1] [1] : tensor<20xf32> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4] [1, 1] [1, 1] : tensor<2x20xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %5 = arith.addf %in_1, %in : f32 + %6 = arith.maximumf %5, %cst : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [1, 1] [1, 1] : tensor into tensor<2x20xf32> + scf.yield %inserted_slice : tensor<2x20xf32> + } + scf.yield %2 : tensor<2x20xf32> + } return %1 : tensor<2x20xf32> } func.func private @Unknown2(%arg0: tensor<10xf32>, %arg1: tensor<2x10xf32>) -> tensor<2x10xf32> attributes {__byteir_elementwise_fusion__} { + %c10 = arith.constant 10 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<2x10xf32> - %1 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : tensor<2x10xf32>, tensor<10xf32>) outs(%0 : tensor<2x10xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %2 = arith.addf %in, %in_0 : f32 - linalg.yield %2 : f32 - } -> tensor<2x10xf32> + %1 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %0) -> (tensor<2x10xf32>) { + %2 = scf.for %arg4 = %c0 to %c10 step %c1 iter_args(%arg5 = %arg3) -> (tensor<2x10xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg4] [1] [1] : tensor<10xf32> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4] [1, 1] [1, 1] : tensor<2x10xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %5 = arith.addf %in_1, %in : f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [1, 1] [1, 1] : tensor into tensor<2x10xf32> + scf.yield %inserted_slice : tensor<2x10xf32> + } + scf.yield %2 : tensor<2x10xf32> + } return %1 : tensor<2x10xf32> } func.func @forward(%arg0: tensor<2x10xf32>) -> tensor<2x10xf32> attributes {__placeholder__byre.entry_point} { - %cst = arith.constant dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, 
-0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32> - %cst_0 = arith.constant dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32> - %cst_1 = arith.constant dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32> - %cst_2 = arith.constant dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> : tensor<20x10xf32> - %cst_3 = arith.constant 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> : tensor<20x20xf32> - %cst_4 = arith.constant 
dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> : tensor<10x20xf32> + %cst = arith.constant dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> : tensor<20x10xf32> + %cst_0 = arith.constant dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 
0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32> + %cst_1 = arith.constant dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> : tensor<20x20xf32> + %cst_2 = arith.constant dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, 
-0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32> + %cst_3 = arith.constant dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> : tensor<10x20xf32> + %cst_4 = arith.constant dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32> %0 = tensor.empty() : tensor<2x20xf32> - %1 = byre.compute_on_tensor @MatmulOp_f32f32_f32 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 0 : i64} ins(%arg0, %cst_4 : tensor<2x10xf32>, tensor<10x20xf32>) outs(%0 : tensor<2x20xf32>) : tensor<2x20xf32> - %2 = call @Unknown0(%cst, %1) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> + %1 = byre.compute_on_tensor @MatmulOp_f32f32_f32 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 1 : i64} ins(%arg0, %cst : tensor<2x10xf32>, tensor<20x10xf32>) outs(%0 : tensor<2x20xf32>) : tensor<2x20xf32> + %2 = call @Unknown0(%cst_0, %1) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> %3 = tensor.empty() : tensor<2x20xf32> - %4 = byre.compute_on_tensor @MatmulOp_f32f32_f32 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 0 : i64} ins(%2, %cst_3 : tensor<2x20xf32>, tensor<20x20xf32>) outs(%3 : tensor<2x20xf32>) : tensor<2x20xf32> - %5 = call @Unknown1(%cst_0, %4) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> + %4 = byre.compute_on_tensor @MatmulOp_f32f32_f32 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 1 : i64} ins(%2, %cst_1 : tensor<2x20xf32>, tensor<20x20xf32>) outs(%3 : tensor<2x20xf32>) : tensor<2x20xf32> + %5 = call @Unknown0(%cst_2, %4) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> %6 = tensor.empty() : tensor<2x10xf32> - %7 = byre.compute_on_tensor @MatmulOp_f32f32_f32 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 0 : i64} ins(%5, %cst_2 : tensor<2x20xf32>, tensor<20x10xf32>) outs(%6 : 
tensor<2x10xf32>) : tensor<2x10xf32> - %8 = call @Unknown2(%cst_1, %7) : (tensor<10xf32>, tensor<2x10xf32>) -> tensor<2x10xf32> + %7 = byre.compute_on_tensor @MatmulOp_f32f32_f32 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 1 : i64} ins(%5, %cst_3 : tensor<2x20xf32>, tensor<10x20xf32>) outs(%6 : tensor<2x10xf32>) : tensor<2x10xf32> + %8 = call @Unknown2(%cst_4, %7) : (tensor<10xf32>, tensor<2x10xf32>) -> tensor<2x10xf32> return %8 : tensor<2x10xf32> } -} +} \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/5_affine_opt.mlir b/compiler/test/E2E/MLPInference/5_affine_opt.mlir index aeb9723dd..4c8f4570e 100644 --- a/compiler/test/E2E/MLPInference/5_affine_opt.mlir +++ b/compiler/test/E2E/MLPInference/5_affine_opt.mlir @@ -2,62 +2,72 @@ // CHECK-LABEL: func.func @forward -#map = affine_map<(d0, d1) -> (d0, d1)> -#map1 = affine_map<(d0, d1) -> (d1)> +#map = affine_map<() -> ()> module attributes {torch.debug_module_name = "GraphModule"} { - memref.global "private" constant @__constant_10x20xf32 : memref<10x20xf32> = dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> - memref.global "private" constant @__constant_20x20xf32 : memref<20x20xf32> = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> - memref.global "private" constant @__constant_20x10xf32 : memref<20x10xf32> = 
dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> memref.global "private" constant @__constant_10xf32 : memref<10xf32> = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> + memref.global "private" constant @__constant_10x20xf32 : memref<10x20xf32> = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> memref.global "private" constant 
@__constant_20xf32_0 : memref<20xf32> = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> + memref.global "private" constant @__constant_20x20xf32 : memref<20x20xf32> = dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3E
B47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> memref.global "private" constant @__constant_20xf32 : memref<20xf32> = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> + memref.global "private" constant @__constant_20x10xf32 : memref<20x10xf32> = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> func.func private @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c20 = arith.constant 20 : index %alloc = memref.alloc() : memref<2x20xf32> - linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<2x20xf32>, memref<20xf32>) outs(%alloc : memref<2x20xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %0 = arith.addf %in, %in_0 : f32 - %1 = arith.maxnumf %0, %cst : f32 - linalg.yield %1 : f32 - } - return %alloc : memref<2x20xf32> - } - func.func private @Unknown1(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f32 - %alloc = memref.alloc() : memref<2x20xf32> - linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<2x20xf32>, memref<20xf32>) outs(%alloc : memref<2x20xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %0 = arith.addf %in, %in_0 : f32 - %1 = arith.maxnumf %0, %cst : f32 - linalg.yield %1 : f32 + scf.for %arg2 = %c0 to %c2 step %c1 { + scf.for %arg3 = %c0 to %c20 step %c1 { + %subview = memref.subview %arg0[%arg3] [1] [1] : memref<20xf32> to memref> + 
%subview_0 = memref.subview %alloc[%arg2, %arg3] [1, 1] [1, 1] : memref<2x20xf32> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3] [1, 1] [1, 1] : memref<2x20xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %0 = arith.addf %in_2, %in : f32 + %1 = arith.maximumf %0, %cst : f32 + linalg.yield %1 : f32 + } + } } return %alloc : memref<2x20xf32> } func.func private @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>) -> memref<2x10xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index %alloc = memref.alloc() : memref<2x10xf32> - linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<2x10xf32>, memref<10xf32>) outs(%alloc : memref<2x10xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %0 = arith.addf %in, %in_0 : f32 - linalg.yield %0 : f32 + scf.for %arg2 = %c0 to %c2 step %c1 { + scf.for %arg3 = %c0 to %c10 step %c1 { + %subview = memref.subview %arg0[%arg3] [1] [1] : memref<10xf32> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3] [1, 1] [1, 1] : memref<2x10xf32> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3] [1, 1] [1, 1] : memref<2x10xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %0 = arith.addf %in_2, %in : f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<2x10xf32> } func.func @forward(%arg0: memref<2x10xf32>) -> memref<2x10xf32> attributes {__placeholder__byre.entry_point} { - %0 = memref.get_global @__constant_20xf32 : memref<20xf32> - %1 = memref.get_global @__constant_20xf32_0 : memref<20xf32> - %2 = memref.get_global @__constant_10xf32 : memref<10xf32> - %3 = memref.get_global @__constant_20x10xf32 : memref<20x10xf32> - %4 = memref.get_global @__constant_20x20xf32 : memref<20x20xf32> - %5 = memref.get_global @__constant_10x20xf32 : memref<10x20xf32> + %0 = memref.get_global @__constant_20x10xf32 : memref<20x10xf32> + %1 = memref.get_global @__constant_20xf32 : memref<20xf32> + %2 = memref.get_global @__constant_20x20xf32 : memref<20x20xf32> + %3 = memref.get_global @__constant_20xf32_0 : memref<20xf32> + %4 = memref.get_global @__constant_10x20xf32 : memref<10x20xf32> + %5 = memref.get_global @__constant_10xf32 : memref<10xf32> %alloc = memref.alloc() : memref<2x20xf32> - byre.compute @MatmulOp_f32f32_f32(%arg0, %5, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x10xf32>, memref<10x20xf32>, memref<2x20xf32> - %6 = call @Unknown0(%0, %alloc) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> + byre.compute @MatmulOp_f32f32_f32(%arg0, %0, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x10xf32>, memref<20x10xf32>, memref<2x20xf32> + %6 = call @Unknown0(%1, %alloc) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> %alloc_0 = memref.alloc() : memref<2x20xf32> - byre.compute @MatmulOp_f32f32_f32(%6, %4, %alloc_0) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} 
: memref<2x20xf32>, memref<20x20xf32>, memref<2x20xf32> - %7 = call @Unknown1(%1, %alloc_0) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> + byre.compute @MatmulOp_f32f32_f32(%6, %2, %alloc_0) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32>, memref<20x20xf32>, memref<2x20xf32> + %7 = call @Unknown0(%3, %alloc_0) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> %alloc_1 = memref.alloc() : memref<2x10xf32> - byre.compute @MatmulOp_f32f32_f32(%7, %3, %alloc_1) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32>, memref<20x10xf32>, memref<2x10xf32> - %8 = call @Unknown2(%2, %alloc_1) : (memref<10xf32>, memref<2x10xf32>) -> memref<2x10xf32> + byre.compute @MatmulOp_f32f32_f32(%7, %4, %alloc_1) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32>, memref<10x20xf32>, memref<2x10xf32> + %8 = call @Unknown2(%5, %alloc_1) : (memref<10xf32>, memref<2x10xf32>) -> memref<2x10xf32> return %8 : memref<2x10xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/5_alternative_scf_opt.mlir b/compiler/test/E2E/MLPInference/5_alternative_scf_opt.mlir index 08dda3cc8..6e8595988 100644 --- a/compiler/test/E2E/MLPInference/5_alternative_scf_opt.mlir +++ b/compiler/test/E2E/MLPInference/5_alternative_scf_opt.mlir @@ -2,62 +2,72 @@ // CHECK-LABEL: func.func @forward -#map = affine_map<(d0, d1) -> (d0, d1)> -#map1 = affine_map<(d0, d1) -> (d1)> +#map = affine_map<() -> ()> module attributes {torch.debug_module_name = "GraphModule"} { - memref.global "private" constant @__constant_10x20xf32 : memref<10x20xf32> = dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> - memref.global "private" constant @__constant_20x20xf32 : memref<20x20xf32> = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> - memref.global "private" constant @__constant_20x10xf32 : memref<20x10xf32> = 
dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> memref.global "private" constant @__constant_10xf32 : memref<10xf32> = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> + memref.global "private" constant @__constant_10x20xf32 : memref<10x20xf32> = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> memref.global "private" constant 
@__constant_20xf32_0 : memref<20xf32> = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> + memref.global "private" constant @__constant_20x20xf32 : memref<20x20xf32> = dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3E
B47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> memref.global "private" constant @__constant_20xf32 : memref<20xf32> = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> + memref.global "private" constant @__constant_20x10xf32 : memref<20x10xf32> = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> func.func private @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c20 = arith.constant 20 : index %alloc = memref.alloc() : memref<2x20xf32> - linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<2x20xf32>, memref<20xf32>) outs(%alloc : memref<2x20xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %0 = arith.addf %in, %in_0 : f32 - %1 = arith.maxnumf %0, %cst : f32 - linalg.yield %1 : f32 - } - return %alloc : memref<2x20xf32> - } - func.func private @Unknown1(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f32 - %alloc = memref.alloc() : memref<2x20xf32> - linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<2x20xf32>, memref<20xf32>) outs(%alloc : memref<2x20xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %0 = arith.addf %in, %in_0 : f32 - %1 = arith.maxnumf %0, %cst : f32 - linalg.yield %1 : f32 + scf.for %arg2 = %c0 to %c2 step %c1 { + scf.for %arg3 = %c0 to %c20 step %c1 { + %subview = memref.subview %arg0[%arg3] [1] [1] : memref<20xf32> to memref> + 
%subview_0 = memref.subview %alloc[%arg2, %arg3] [1, 1] [1, 1] : memref<2x20xf32> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3] [1, 1] [1, 1] : memref<2x20xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %0 = arith.addf %in_2, %in : f32 + %1 = arith.maximumf %0, %cst : f32 + linalg.yield %1 : f32 + } + } } return %alloc : memref<2x20xf32> } func.func private @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>) -> memref<2x10xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index %alloc = memref.alloc() : memref<2x10xf32> - linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<2x10xf32>, memref<10xf32>) outs(%alloc : memref<2x10xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %0 = arith.addf %in, %in_0 : f32 - linalg.yield %0 : f32 + scf.for %arg2 = %c0 to %c2 step %c1 { + scf.for %arg3 = %c0 to %c10 step %c1 { + %subview = memref.subview %arg0[%arg3] [1] [1] : memref<10xf32> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3] [1, 1] [1, 1] : memref<2x10xf32> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3] [1, 1] [1, 1] : memref<2x10xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %0 = arith.addf %in_2, %in : f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<2x10xf32> } func.func @forward(%arg0: memref<2x10xf32>) -> memref<2x10xf32> attributes {__placeholder__byre.entry_point} { - %0 = memref.get_global @__constant_20xf32 : memref<20xf32> - %1 = memref.get_global @__constant_20xf32_0 : memref<20xf32> - %2 = memref.get_global @__constant_10xf32 : memref<10xf32> - %3 = memref.get_global @__constant_20x10xf32 : memref<20x10xf32> - %4 = memref.get_global @__constant_20x20xf32 : memref<20x20xf32> - %5 = memref.get_global @__constant_10x20xf32 : memref<10x20xf32> + %0 = memref.get_global @__constant_20x10xf32 : memref<20x10xf32> + %1 = memref.get_global @__constant_20xf32 : memref<20xf32> + %2 = memref.get_global @__constant_20x20xf32 : memref<20x20xf32> + %3 = memref.get_global @__constant_20xf32_0 : memref<20xf32> + %4 = memref.get_global @__constant_10x20xf32 : memref<10x20xf32> + %5 = memref.get_global @__constant_10xf32 : memref<10xf32> %alloc = memref.alloc() : memref<2x20xf32> - byre.compute @MatmulOp_f32f32_f32(%arg0, %5, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x10xf32>, memref<10x20xf32>, memref<2x20xf32> - %6 = call @Unknown0(%0, %alloc) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> + byre.compute @MatmulOp_f32f32_f32(%arg0, %0, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x10xf32>, memref<20x10xf32>, memref<2x20xf32> + %6 = call @Unknown0(%1, %alloc) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> %alloc_0 = memref.alloc() : memref<2x20xf32> - byre.compute @MatmulOp_f32f32_f32(%6, %4, %alloc_0) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} 
: memref<2x20xf32>, memref<20x20xf32>, memref<2x20xf32> - %7 = call @Unknown1(%1, %alloc_0) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> + byre.compute @MatmulOp_f32f32_f32(%6, %2, %alloc_0) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32>, memref<20x20xf32>, memref<2x20xf32> + %7 = call @Unknown0(%3, %alloc_0) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> %alloc_1 = memref.alloc() : memref<2x10xf32> - byre.compute @MatmulOp_f32f32_f32(%7, %3, %alloc_1) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32>, memref<20x10xf32>, memref<2x10xf32> - %8 = call @Unknown2(%2, %alloc_1) : (memref<10xf32>, memref<2x10xf32>) -> memref<2x10xf32> + byre.compute @MatmulOp_f32f32_f32(%7, %4, %alloc_1) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32>, memref<10x20xf32>, memref<2x10xf32> + %8 = call @Unknown2(%5, %alloc_1) : (memref<10xf32>, memref<2x10xf32>) -> memref<2x10xf32> return %8 : memref<2x10xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/6_gpu_opt.mlir b/compiler/test/E2E/MLPInference/6_gpu_opt.mlir index a3052a076..1cfadee5a 100644 --- a/compiler/test/E2E/MLPInference/6_gpu_opt.mlir +++ b/compiler/test/E2E/MLPInference/6_gpu_opt.mlir @@ -3,107 +3,62 @@ // CHECK-LABEL: func.func @forward module attributes {torch.debug_module_name = "GraphModule"} { - memref.global "private" constant @__constant_10x20xf32 : memref<10x20xf32> = dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> - memref.global "private" constant @__constant_20x20xf32 : memref<20x20xf32> = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> - memref.global "private" constant @__constant_20x10xf32 : memref<20x10xf32> = 
dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> memref.global "private" constant @__constant_10xf32 : memref<10xf32> = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> + memref.global "private" constant @__constant_10x20xf32 : memref<10x20xf32> = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> memref.global "private" constant 
@__constant_20xf32_0 : memref<20xf32> = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> + memref.global "private" constant @__constant_20x20xf32 : memref<20x20xf32> = dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3E
B47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> memref.global "private" constant @__constant_20xf32 : memref<20xf32> = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> + memref.global "private" constant @__constant_20x10xf32 : memref<20x10xf32> = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> func.func private @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c40 = arith.constant 40 : index - %c1 = arith.constant 1 : index %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<2x20xf32> - scf.for %arg2 = %c0 to %c40 step %c1 { - %0 = arith.remsi %arg2, %c20 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c20 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c20 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg1[%9, %3] : memref<2x20xf32> - %11 = memref.load %arg0[%3] : memref<20xf32> - %12 = arith.addf %10, %11 : f32 - %13 = arith.maxnumf %12, %cst : f32 - memref.store %13, %alloc[%9, %3] : memref<2x20xf32> - } - return %alloc : memref<2x20xf32> - } - func.func private @Unknown1(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f32 + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 %c40 = arith.constant 40 : index - %c1 = arith.constant 
1 : index - %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<2x20xf32> scf.for %arg2 = %c0 to %c40 step %c1 { %0 = arith.remsi %arg2, %c20 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c20 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c20 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg1[%9, %3] : memref<2x20xf32> - %11 = memref.load %arg0[%3] : memref<20xf32> - %12 = arith.addf %10, %11 : f32 - %13 = arith.maxnumf %12, %cst : f32 - memref.store %13, %alloc[%9, %3] : memref<2x20xf32> + %1 = arith.divsi %arg2, %c20 : index + %2 = memref.load %arg0[%0] : memref<20xf32> + %3 = memref.load %arg1[%1, %0] : memref<2x20xf32> + %4 = arith.addf %3, %2 : f32 + %5 = arith.maximumf %4, %cst : f32 + memref.store %5, %alloc[%1, %0] : memref<2x20xf32> } return %alloc : memref<2x20xf32> } func.func private @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>) -> memref<2x10xf32> attributes {__byteir_elementwise_fusion__} { + %c10 = arith.constant 10 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c20 = arith.constant 20 : index - %c1 = arith.constant 1 : index - %c10 = arith.constant 10 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<2x10xf32> scf.for %arg2 = %c0 to %c20 step %c1 { %0 = arith.remsi %arg2, %c10 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c10 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c10 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg1[%9, %3] : memref<2x10xf32> - %11 = memref.load %arg0[%3] : memref<10xf32> - %12 = arith.addf %10, %11 : f32 - memref.store %12, %alloc[%9, %3] : memref<2x10xf32> + %1 = arith.divsi %arg2, %c10 : index + %2 = memref.load %arg0[%0] : memref<10xf32> + %3 = memref.load %arg1[%1, %0] : memref<2x10xf32> + %4 = arith.addf %3, %2 : f32 + memref.store %4, %alloc[%1, %0] : memref<2x10xf32> } return %alloc : memref<2x10xf32> } func.func @forward(%arg0: memref<2x10xf32>) -> memref<2x10xf32> attributes {__placeholder__byre.entry_point} { - %0 = memref.get_global @__constant_20xf32 : memref<20xf32> - %1 = memref.get_global @__constant_20xf32_0 : memref<20xf32> - %2 = memref.get_global @__constant_10xf32 : memref<10xf32> - %3 = memref.get_global @__constant_20x10xf32 : memref<20x10xf32> - %4 = memref.get_global @__constant_20x20xf32 : memref<20x20xf32> - %5 = memref.get_global @__constant_10x20xf32 : memref<10x20xf32> + %0 = memref.get_global @__constant_20x10xf32 : memref<20x10xf32> + %1 = memref.get_global @__constant_20xf32 : memref<20xf32> + %2 = memref.get_global @__constant_20x20xf32 : memref<20x20xf32> + %3 = memref.get_global @__constant_20xf32_0 : memref<20xf32> + %4 = memref.get_global @__constant_10x20xf32 : memref<10x20xf32> + %5 = memref.get_global @__constant_10xf32 : memref<10xf32> %alloc = memref.alloc() : memref<2x20xf32> - byre.compute @MatmulOp_f32f32_f32(%arg0, %5, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x10xf32>, memref<10x20xf32>, memref<2x20xf32> - %6 = call @Unknown0(%0, %alloc) : 
(memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> + byre.compute @MatmulOp_f32f32_f32(%arg0, %0, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x10xf32>, memref<20x10xf32>, memref<2x20xf32> + %6 = call @Unknown0(%1, %alloc) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> %alloc_0 = memref.alloc() : memref<2x20xf32> - byre.compute @MatmulOp_f32f32_f32(%6, %4, %alloc_0) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32>, memref<20x20xf32>, memref<2x20xf32> - %7 = call @Unknown1(%1, %alloc_0) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> + byre.compute @MatmulOp_f32f32_f32(%6, %2, %alloc_0) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32>, memref<20x20xf32>, memref<2x20xf32> + %7 = call @Unknown0(%3, %alloc_0) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> %alloc_1 = memref.alloc() : memref<2x10xf32> - byre.compute @MatmulOp_f32f32_f32(%7, %3, %alloc_1) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32>, memref<20x10xf32>, memref<2x10xf32> - %8 = call @Unknown2(%2, %alloc_1) : (memref<10xf32>, memref<2x10xf32>) -> memref<2x10xf32> + byre.compute @MatmulOp_f32f32_f32(%7, %4, %alloc_1) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32>, memref<10x20xf32>, memref<2x10xf32> + %8 = call @Unknown2(%5, %alloc_1) : (memref<10xf32>, memref<2x10xf32>) -> memref<2x10xf32> return %8 : memref<2x10xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/7_set_space_opt.mlir b/compiler/test/E2E/MLPInference/7_set_space_opt.mlir index 5e9cb9cbd..5f0671566 100644 --- a/compiler/test/E2E/MLPInference/7_set_space_opt.mlir +++ b/compiler/test/E2E/MLPInference/7_set_space_opt.mlir @@ -1,144 +1,88 @@ -// RUN: byteir-opt %s -remove-func-body="anchor-attr=__byteir_elementwise_fusion__" -set-op-space="entry-func=forward space=cuda" -set-arg-space="entry-func=forward all-space=cuda" | FileCheck %s +// RUN: byteir-opt %s -remove-func-body="anchor-attr=__byteir_elementwise_fusion__" -inline -gpu-launch-func-to-byre -set-op-space="entry-func=forward space=cuda" -set-arg-space="entry-func=forward all-space=cuda" | FileCheck %s // CHECK-LABEL: func.func @forward module attributes {gpu.container_module, torch.debug_module_name = "GraphModule"} { gpu.module @unified { gpu.func @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>, %arg2: memref<2x10xf32>) kernel { - %c0 = arith.constant 0 : index %c20 = arith.constant 20 : index %c10 = arith.constant 10 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c20 : index - scf.if %5 { - %6 = arith.remsi %4, %c10 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c10 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c10 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x10xf32> - %17 = memref.load 
%arg0[%9] : memref<10xf32> - %18 = arith.addf %16, %17 : f32 - memref.store %18, %arg2[%15, %9] : memref<2x10xf32> - } - gpu.return - } - gpu.func @Unknown1(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>, %arg2: memref<2x20xf32>) kernel { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c40 = arith.constant 40 : index - %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c40 : index - scf.if %5 { - %6 = arith.remsi %4, %c20 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c20 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c20 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x20xf32> - %17 = memref.load %arg0[%9] : memref<20xf32> - %18 = arith.addf %16, %17 : f32 - %19 = arith.maxnumf %18, %cst : f32 - memref.store %19, %arg2[%15, %9] : memref<2x20xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c20 step %6 { + %7 = arith.remsi %arg3, %c10 : index + %8 = arith.divsi %arg3, %c10 : index + %9 = memref.load %arg0[%7] : memref<10xf32> + %10 = memref.load %arg1[%8, %7] : memref<2x10xf32> + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%8, %7] : memref<2x10xf32> } gpu.return } gpu.func @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>, %arg2: memref<2x20xf32>) kernel { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index %c40 = arith.constant 40 : index + %cst = arith.constant 0.000000e+00 : f32 %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c40 : index - scf.if %5 { - %6 = arith.remsi %4, %c20 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c20 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c20 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x20xf32> - %17 = memref.load %arg0[%9] : memref<20xf32> - %18 = arith.addf %16, %17 : f32 - %19 = arith.maxnumf %18, %cst : f32 - memref.store %19, %arg2[%15, %9] : memref<2x20xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c40 step %6 { + %7 = arith.remsi %arg3, %c20 : index + %8 = arith.divsi %arg3, %c20 : index + %9 = memref.load %arg0[%7] : memref<20xf32> + %10 = memref.load %arg1[%8, %7] : memref<2x20xf32> + %11 = arith.addf %10, %9 : f32 + %12 = arith.maximumf %11, %cst : f32 + memref.store %12, %arg2[%8, %7] : memref<2x20xf32> } gpu.return } } - memref.global "private" constant @__constant_10x20xf32 : memref<10x20xf32> = 
dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> - memref.global "private" constant @__constant_20x20xf32 : memref<20x20xf32> = dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098B
D7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> - memref.global "private" constant @__constant_20x10xf32 : memref<20x10xf32> = dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> memref.global "private" constant @__constant_10xf32 : memref<10xf32> = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> + memref.global "private" constant @__constant_10x20xf32 : memref<10x20xf32> = 
dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> memref.global "private" constant @__constant_20xf32_0 : memref<20xf32> = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> + memref.global "private" constant @__constant_20x20xf32 : memref<20x20xf32> = 
dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> memref.global "private" constant @__constant_20xf32 : memref<20xf32> = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, 
-0.155078679]> - func.func private @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<2x20xf32> - gpu.launch_func @unified::@Unknown0 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<20xf32>, %arg1 : memref<2x20xf32>, %alloc : memref<2x20xf32>) - return %alloc : memref<2x20xf32> - } - func.func private @Unknown1(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + memref.global "private" constant @__constant_20x10xf32 : memref<20x10xf32> = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> + func.func private @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<2x20xf32> - gpu.launch_func @unified::@Unknown1 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<20xf32>, %arg1 : memref<2x20xf32>, %alloc : memref<2x20xf32>) + gpu.launch_func @unified::@Unknown0 
blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<20xf32>, %arg1 : memref<2x20xf32>, %alloc : memref<2x20xf32>) return %alloc : memref<2x20xf32> } - func.func private @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>) -> memref<2x10xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown2", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>) -> memref<2x10xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown2", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<2x10xf32> - gpu.launch_func @unified::@Unknown2 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<10xf32>, %arg1 : memref<2x10xf32>, %alloc : memref<2x10xf32>) + gpu.launch_func @unified::@Unknown2 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<10xf32>, %arg1 : memref<2x10xf32>, %alloc : memref<2x10xf32>) return %alloc : memref<2x10xf32> } func.func @forward(%arg0: memref<2x10xf32>) -> memref<2x10xf32> attributes {__placeholder__byre.entry_point} { - %0 = memref.get_global @__constant_20xf32 : memref<20xf32> - %1 = memref.get_global @__constant_20xf32_0 : memref<20xf32> - %2 = memref.get_global @__constant_10xf32 : memref<10xf32> - %3 = memref.get_global @__constant_20x10xf32 : memref<20x10xf32> - %4 = memref.get_global @__constant_20x20xf32 : memref<20x20xf32> - %5 = memref.get_global @__constant_10x20xf32 : memref<10x20xf32> + %0 = memref.get_global @__constant_20x10xf32 : memref<20x10xf32> + %1 = memref.get_global @__constant_20xf32 : memref<20xf32> + %2 = memref.get_global @__constant_20x20xf32 : memref<20x20xf32> + %3 = memref.get_global @__constant_20xf32_0 : memref<20xf32> + %4 = memref.get_global @__constant_10x20xf32 : memref<10x20xf32> + %5 = memref.get_global @__constant_10xf32 : memref<10xf32> %alloc = memref.alloc() : memref<2x20xf32> - byre.compute @MatmulOp_f32f32_f32(%arg0, %5, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x10xf32>, memref<10x20xf32>, memref<2x20xf32> - %6 = call @Unknown0(%0, %alloc) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> + byre.compute @MatmulOp_f32f32_f32(%arg0, %0, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x10xf32>, memref<20x10xf32>, memref<2x20xf32> + %6 = call @Unknown0(%1, %alloc) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> %alloc_0 = memref.alloc() : memref<2x20xf32> - byre.compute @MatmulOp_f32f32_f32(%6, %4, %alloc_0) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32>, memref<20x20xf32>, memref<2x20xf32> - %7 = call @Unknown1(%1, %alloc_0) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> + byre.compute @MatmulOp_f32f32_f32(%6, %2, %alloc_0) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : 
i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32>, memref<20x20xf32>, memref<2x20xf32> + %7 = call @Unknown0(%3, %alloc_0) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> %alloc_1 = memref.alloc() : memref<2x10xf32> - byre.compute @MatmulOp_f32f32_f32(%7, %3, %alloc_1) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32>, memref<20x10xf32>, memref<2x10xf32> - %8 = call @Unknown2(%2, %alloc_1) : (memref<10xf32>, memref<2x10xf32>) -> memref<2x10xf32> + byre.compute @MatmulOp_f32f32_f32(%7, %4, %alloc_1) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32>, memref<10x20xf32>, memref<2x10xf32> + %8 = call @Unknown2(%5, %alloc_1) : (memref<10xf32>, memref<2x10xf32>) -> memref<2x10xf32> return %8 : memref<2x10xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/8_byre_opt.mlir b/compiler/test/E2E/MLPInference/8_byre_opt.mlir index a3557aabd..549239c4e 100644 --- a/compiler/test/E2E/MLPInference/8_byre_opt.mlir +++ b/compiler/test/E2E/MLPInference/8_byre_opt.mlir @@ -5,122 +5,72 @@ module attributes {gpu.container_module, torch.debug_module_name = "GraphModule"} { gpu.module @unified { gpu.func @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>, %arg2: memref<2x10xf32>) kernel { - %c0 = arith.constant 0 : index %c20 = arith.constant 20 : index %c10 = arith.constant 10 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c20 : index - scf.if %5 { - %6 = arith.remsi %4, %c10 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c10 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c10 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x10xf32> - %17 = memref.load %arg0[%9] : memref<10xf32> - %18 = arith.addf %16, %17 : f32 - memref.store %18, %arg2[%15, %9] : memref<2x10xf32> - } - gpu.return - } - gpu.func @Unknown1(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>, %arg2: memref<2x20xf32>) kernel { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c40 = arith.constant 40 : index - %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c40 : index - scf.if %5 { - %6 = arith.remsi %4, %c20 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c20 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c20 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x20xf32> - %17 = memref.load %arg0[%9] : memref<20xf32> - %18 = arith.addf %16, %17 : f32 - %19 = arith.maxnumf %18, %cst : f32 - memref.store %19, %arg2[%15, %9] : memref<2x20xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c20 step %6 { + %7 = arith.remsi %arg3, %c10 : index + %8 = 
arith.divsi %arg3, %c10 : index + %9 = memref.load %arg0[%7] : memref<10xf32> + %10 = memref.load %arg1[%8, %7] : memref<2x10xf32> + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%8, %7] : memref<2x10xf32> } gpu.return } gpu.func @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>, %arg2: memref<2x20xf32>) kernel { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index %c40 = arith.constant 40 : index + %cst = arith.constant 0.000000e+00 : f32 %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c40 : index - scf.if %5 { - %6 = arith.remsi %4, %c20 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c20 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c20 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x20xf32> - %17 = memref.load %arg0[%9] : memref<20xf32> - %18 = arith.addf %16, %17 : f32 - %19 = arith.maxnumf %18, %cst : f32 - memref.store %19, %arg2[%15, %9] : memref<2x20xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c40 step %6 { + %7 = arith.remsi %arg3, %c20 : index + %8 = arith.divsi %arg3, %c20 : index + %9 = memref.load %arg0[%7] : memref<20xf32> + %10 = memref.load %arg1[%8, %7] : memref<2x20xf32> + %11 = arith.addf %10, %9 : f32 + %12 = arith.maximumf %11, %cst : f32 + memref.store %12, %arg2[%8, %7] : memref<2x20xf32> } gpu.return } } - func.func private @Unknown0(memref<20xf32, "cuda">, memref<2x20xf32, "cuda">) -> memref<2x20xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown1(memref<20xf32, "cuda">, memref<2x20xf32, "cuda">) -> memref<2x20xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown2(memref<10xf32, "cuda">, memref<2x10xf32, "cuda">) -> memref<2x10xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown2", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown0(memref<20xf32, "cuda">, memref<2x20xf32, "cuda">) -> memref<2x20xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown2(memref<10xf32, "cuda">, memref<2x10xf32, "cuda">) -> memref<2x10xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, 
__byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown2", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} func.func @forward(%arg0: memref<2x10xf32, "cuda">) -> memref<2x10xf32, "cuda"> attributes {__placeholder__byre.entry_point} { - %0 = memref.get_global @__constant_20xf32_cuda : memref<20xf32, "cuda"> {device = "cuda"} - %1 = memref.get_global @__constant_20xf32_0_cuda : memref<20xf32, "cuda"> {device = "cuda"} - %2 = memref.get_global @__constant_10xf32_cuda : memref<10xf32, "cuda"> {device = "cuda"} - %3 = memref.get_global @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> {device = "cuda"} - %4 = memref.get_global @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> {device = "cuda"} - %5 = memref.get_global @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> {device = "cuda"} + %0 = memref.get_global @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> {device = "cuda"} + %1 = memref.get_global @__constant_20xf32_cuda : memref<20xf32, "cuda"> {device = "cuda"} + %2 = memref.get_global @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> {device = "cuda"} + %3 = memref.get_global @__constant_20xf32_0_cuda : memref<20xf32, "cuda"> {device = "cuda"} + %4 = memref.get_global @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> {device = "cuda"} + %5 = memref.get_global @__constant_10xf32_cuda : memref<10xf32, "cuda"> {device = "cuda"} %alloc = memref.alloc() : memref<2x20xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%arg0, %5, %alloc) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x10xf32, "cuda">, memref<10x20xf32, "cuda">, memref<2x20xf32, "cuda"> - %6 = call @Unknown0(%0, %alloc) : (memref<20xf32, "cuda">, memref<2x20xf32, "cuda">) -> memref<2x20xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%arg0, %0, %alloc) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x10xf32, "cuda">, memref<20x10xf32, "cuda">, memref<2x20xf32, "cuda"> + %6 = call @Unknown0(%1, %alloc) : (memref<20xf32, "cuda">, memref<2x20xf32, "cuda">) -> memref<2x20xf32, "cuda"> %alloc_0 = memref.alloc() : memref<2x20xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%6, %4, %alloc_0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32, "cuda">, memref<20x20xf32, "cuda">, memref<2x20xf32, "cuda"> - %7 = call @Unknown1(%1, %alloc_0) : (memref<20xf32, "cuda">, memref<2x20xf32, "cuda">) -> memref<2x20xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%6, %2, %alloc_0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32, "cuda">, memref<20x20xf32, "cuda">, memref<2x20xf32, "cuda"> + %7 = call @Unknown0(%3, %alloc_0) : (memref<20xf32, "cuda">, memref<2x20xf32, "cuda">) -> memref<2x20xf32, "cuda"> %alloc_1 = memref.alloc() : memref<2x10xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%7, %3, %alloc_1) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32, "cuda">, memref<20x10xf32, "cuda">, memref<2x10xf32, "cuda"> - %8 = call @Unknown2(%2, %alloc_1) : 
(memref<10xf32, "cuda">, memref<2x10xf32, "cuda">) -> memref<2x10xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%7, %4, %alloc_1) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32, "cuda">, memref<10x20xf32, "cuda">, memref<2x10xf32, "cuda"> + %8 = call @Unknown2(%5, %alloc_1) : (memref<10xf32, "cuda">, memref<2x10xf32, "cuda">) -> memref<2x10xf32, "cuda"> return %8 : memref<2x10xf32, "cuda"> } - memref.global "private" constant @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> = dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> - memref.global "private" constant @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> - memref.global "private" constant @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> = 
dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> memref.global "private" constant @__constant_10xf32_cuda : memref<10xf32, "cuda"> = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> + memref.global "private" constant @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> 
memref.global "private" constant @__constant_20xf32_0_cuda : memref<20xf32, "cuda"> = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> + memref.global "private" constant @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> = dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4
ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> memref.global "private" constant @__constant_20xf32_cuda : memref<20xf32, "cuda"> = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> + memref.global "private" constant @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/9a_byre_host.mlir b/compiler/test/E2E/MLPInference/9a_byre_host.mlir index 68f78a4cb..2e66696ba 100644 --- a/compiler/test/E2E/MLPInference/9a_byre_host.mlir +++ b/compiler/test/E2E/MLPInference/9a_byre_host.mlir @@ -5,126 +5,65 @@ module attributes {byre.container_module, gpu.container_module, torch.debug_module_name = "GraphModule"} { gpu.module @unified { gpu.func @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>, %arg2: memref<2x10xf32>) kernel { - %c0 = arith.constant 0 : index %c20 = arith.constant 20 : index %c10 = arith.constant 10 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c20 : index - scf.if %5 { - %6 = arith.remsi %4, %c10 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c10 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c10 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x10xf32> - %17 = memref.load %arg0[%9] : memref<10xf32> - %18 = arith.addf 
%16, %17 : f32 - memref.store %18, %arg2[%15, %9] : memref<2x10xf32> - } - gpu.return - } - gpu.func @Unknown1(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>, %arg2: memref<2x20xf32>) kernel { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c40 = arith.constant 40 : index - %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c40 : index - scf.if %5 { - %6 = arith.remsi %4, %c20 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c20 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c20 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x20xf32> - %17 = memref.load %arg0[%9] : memref<20xf32> - %18 = arith.addf %16, %17 : f32 - %19 = arith.maxnumf %18, %cst : f32 - memref.store %19, %arg2[%15, %9] : memref<2x20xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c20 step %6 { + %7 = arith.remsi %arg3, %c10 : index + %8 = arith.divsi %arg3, %c10 : index + %9 = memref.load %arg0[%7] : memref<10xf32> + %10 = memref.load %arg1[%8, %7] : memref<2x10xf32> + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%8, %7] : memref<2x10xf32> } gpu.return } gpu.func @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>, %arg2: memref<2x20xf32>) kernel { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index %c40 = arith.constant 40 : index + %cst = arith.constant 0.000000e+00 : f32 %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c40 : index - scf.if %5 { - %6 = arith.remsi %4, %c20 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c20 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c20 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x20xf32> - %17 = memref.load %arg0[%9] : memref<20xf32> - %18 = arith.addf %16, %17 : f32 - %19 = arith.maxnumf %18, %cst : f32 - memref.store %19, %arg2[%15, %9] : memref<2x20xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c40 step %6 { + %7 = arith.remsi %arg3, %c20 : index + %8 = arith.divsi %arg3, %c20 : index + %9 = memref.load %arg0[%7] : memref<20xf32> + %10 = memref.load %arg1[%8, %7] : memref<2x20xf32> + %11 = arith.addf %10, %9 : f32 + %12 = arith.maximumf %11, %cst : f32 + memref.store %12, %arg2[%8, %7] : memref<2x20xf32> } gpu.return } } - func.func @forward(%arg0: memref<2x10xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<2x10xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { - %alloc = memref.alloc() : memref<320xi8, "cuda"> - %alloc_0 = memref.alloc() : memref<20xf32, "cuda"> - byre.compute @FillOp(%alloc_0) {memory_effects = [2 : i32], value = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, 
-0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32>} : memref<20xf32, "cuda"> - %alloc_1 = memref.alloc() : memref<20xf32, "cuda"> - byre.compute @FillOp(%alloc_1) {memory_effects = [2 : i32], value = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32>} : memref<20xf32, "cuda"> - %alloc_2 = memref.alloc() : memref<10xf32, "cuda"> - byre.compute @FillOp(%alloc_2) {memory_effects = [2 : i32], value = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32>} : memref<10xf32, "cuda"> - %alloc_3 = memref.alloc() : memref<20x10xf32, "cuda"> - byre.compute @FillOp(%alloc_3) {memory_effects = [2 : i32], value = dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> : tensor<20x10xf32>} : memref<20x10xf32, "cuda"> - %alloc_4 = memref.alloc() : memref<20x20xf32, "cuda"> - byre.compute @FillOp(%alloc_4) {memory_effects = [2 : i32], value = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> : tensor<20x20xf32>} : memref<20x20xf32, "cuda"> - %alloc_5 = memref.alloc() : memref<10x20xf32, "cuda"> - byre.compute @FillOp(%alloc_5) {memory_effects = [2 : i32], value = 
dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> : tensor<10x20xf32>} : memref<10x20xf32, "cuda"> - %0 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<320xi8, "cuda">) -> memref<2x20xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%arg0, %alloc_5, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x10xf32, "cuda">, memref<10x20xf32, "cuda">, memref<2x20xf32, "cuda"> - %1 = "byre.alias"(%alloc) {offset = 160 : i64} : (memref<320xi8, "cuda">) -> memref<2x20xf32, "cuda"> - byre.compute @PTXOp(%alloc_0, %0, %1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%1, %alloc_4, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32, "cuda">, memref<20x20xf32, "cuda">, memref<2x20xf32, "cuda"> - byre.compute @PTXOp(%alloc_1, %0, %1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> - %2 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<320xi8, "cuda">) -> memref<2x10xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%1, %alloc_3, %2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32, "cuda">, memref<20x10xf32, "cuda">, memref<2x10xf32, "cuda"> - byre.compute @PTXOp(%alloc_2, %2, %arg1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown2", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : 
memref<10xf32, "cuda">, memref<2x10xf32, "cuda">, memref<2x10xf32, "cuda"> + func.func @forward(%arg0: memref<20x10xf32, "cuda"> {byre.argname = "Weight0", byre.argtype = 4 : i32, byre.weight_value = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> : tensor<20x10xf32>}, %arg1: memref<20xf32, "cuda"> {byre.argname = "Weight1", byre.argtype = 4 : i32, byre.weight_value = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32>}, %arg2: memref<20x20xf32, "cuda"> {byre.argname = "Weight2", byre.argtype = 4 : i32, byre.weight_value = 
dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> : tensor<20x20xf32>}, %arg3: memref<20xf32, "cuda"> {byre.argname = "Weight3", byre.argtype = 4 : i32, byre.weight_value = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 
0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32>}, %arg4: memref<10x20xf32, "cuda"> {byre.argname = "Weight4", byre.argtype = 4 : i32, byre.weight_value = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> : tensor<10x20xf32>}, %arg5: memref<10xf32, "cuda"> {byre.argname = "Weight5", byre.argtype = 4 : i32, byre.weight_value = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32>}, %arg6: memref<2x10xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg7: memref<2x10xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { + %alloc = memref.alloc() : memref<512xi8, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<512xi8, "cuda">) -> memref<2x20xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%arg6, %arg0, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x10xf32, "cuda">, memref<20x10xf32, "cuda">, memref<2x20xf32, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 256 : i64}> : (memref<512xi8, "cuda">) -> memref<2x20xf32, "cuda"> + byre.compute @PTXOp(%arg1, %0, %1) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%1, %arg2, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32, "cuda">, memref<20x20xf32, "cuda">, memref<2x20xf32, "cuda"> + byre.compute @PTXOp(%arg3, %0, %1) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : 
i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<512xi8, "cuda">) -> memref<2x10xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%1, %arg4, %2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32, "cuda">, memref<10x20xf32, "cuda">, memref<2x10xf32, "cuda"> + byre.compute @PTXOp(%arg5, %2, %arg7) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown2", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<10xf32, "cuda">, memref<2x10xf32, "cuda">, memref<2x10xf32, "cuda"> return } - memref.global "private" constant @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> = dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> - memref.global "private" constant @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> - memref.global "private" constant @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> = 
dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> memref.global "private" constant @__constant_10xf32_cuda : memref<10xf32, "cuda"> = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> + memref.global "private" constant @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> 
memref.global "private" constant @__constant_20xf32_0_cuda : memref<20xf32, "cuda"> = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> + memref.global "private" constant @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> = dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4
ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> memref.global "private" constant @__constant_20xf32_cuda : memref<20xf32, "cuda"> = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> + memref.global "private" constant @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/9b_nvvm_codegen.mlir b/compiler/test/E2E/MLPInference/9b_nvvm_codegen.mlir index 2e52dd556..bdaa7894b 100644 --- a/compiler/test/E2E/MLPInference/9b_nvvm_codegen.mlir +++ b/compiler/test/E2E/MLPInference/9b_nvvm_codegen.mlir @@ -5,126 +5,65 @@ module attributes {byre.container_module, gpu.container_module, torch.debug_module_name = "GraphModule"} { gpu.module @unified { gpu.func @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>, %arg2: memref<2x10xf32>) kernel { - %c0 = arith.constant 0 : index %c20 = arith.constant 20 : index %c10 = arith.constant 10 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c20 : index - scf.if %5 { - %6 = arith.remsi %4, %c10 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c10 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c10 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x10xf32> - %17 = memref.load %arg0[%9] : memref<10xf32> - %18 = 
arith.addf %16, %17 : f32 - memref.store %18, %arg2[%15, %9] : memref<2x10xf32> - } - gpu.return - } - gpu.func @Unknown1(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>, %arg2: memref<2x20xf32>) kernel { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c40 = arith.constant 40 : index - %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c40 : index - scf.if %5 { - %6 = arith.remsi %4, %c20 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c20 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c20 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x20xf32> - %17 = memref.load %arg0[%9] : memref<20xf32> - %18 = arith.addf %16, %17 : f32 - %19 = arith.maxnumf %18, %cst : f32 - memref.store %19, %arg2[%15, %9] : memref<2x20xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c20 step %6 { + %7 = arith.remsi %arg3, %c10 : index + %8 = arith.divsi %arg3, %c10 : index + %9 = memref.load %arg0[%7] : memref<10xf32> + %10 = memref.load %arg1[%8, %7] : memref<2x10xf32> + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%8, %7] : memref<2x10xf32> } gpu.return } gpu.func @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>, %arg2: memref<2x20xf32>) kernel { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index %c40 = arith.constant 40 : index + %cst = arith.constant 0.000000e+00 : f32 %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c40 : index - scf.if %5 { - %6 = arith.remsi %4, %c20 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c20 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c20 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x20xf32> - %17 = memref.load %arg0[%9] : memref<20xf32> - %18 = arith.addf %16, %17 : f32 - %19 = arith.maxnumf %18, %cst : f32 - memref.store %19, %arg2[%15, %9] : memref<2x20xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c40 step %6 { + %7 = arith.remsi %arg3, %c20 : index + %8 = arith.divsi %arg3, %c20 : index + %9 = memref.load %arg0[%7] : memref<20xf32> + %10 = memref.load %arg1[%8, %7] : memref<2x20xf32> + %11 = arith.addf %10, %9 : f32 + %12 = arith.maximumf %11, %cst : f32 + memref.store %12, %arg2[%8, %7] : memref<2x20xf32> } gpu.return } } - func.func @forward(%arg0: memref<2x10xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<2x10xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { - %alloc = memref.alloc() : memref<320xi8, "cuda"> - %alloc_0 = memref.alloc() : memref<20xf32, "cuda"> - byre.compute @FillOp(%alloc_0) {memory_effects = [2 : i32], value = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, 
-0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32>} : memref<20xf32, "cuda"> - %alloc_1 = memref.alloc() : memref<20xf32, "cuda"> - byre.compute @FillOp(%alloc_1) {memory_effects = [2 : i32], value = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32>} : memref<20xf32, "cuda"> - %alloc_2 = memref.alloc() : memref<10xf32, "cuda"> - byre.compute @FillOp(%alloc_2) {memory_effects = [2 : i32], value = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32>} : memref<10xf32, "cuda"> - %alloc_3 = memref.alloc() : memref<20x10xf32, "cuda"> - byre.compute @FillOp(%alloc_3) {memory_effects = [2 : i32], value = dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> : tensor<20x10xf32>} : memref<20x10xf32, "cuda"> - %alloc_4 = memref.alloc() : memref<20x20xf32, "cuda"> - byre.compute @FillOp(%alloc_4) {memory_effects = [2 : i32], value = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> : tensor<20x20xf32>} : memref<20x20xf32, "cuda"> - %alloc_5 = memref.alloc() : memref<10x20xf32, "cuda"> - byre.compute @FillOp(%alloc_5) {memory_effects = [2 : i32], value = 
dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> : tensor<10x20xf32>} : memref<10x20xf32, "cuda"> - %0 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<320xi8, "cuda">) -> memref<2x20xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%arg0, %alloc_5, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x10xf32, "cuda">, memref<10x20xf32, "cuda">, memref<2x20xf32, "cuda"> - %1 = "byre.alias"(%alloc) {offset = 160 : i64} : (memref<320xi8, "cuda">) -> memref<2x20xf32, "cuda"> - byre.compute @PTXOp(%alloc_0, %0, %1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%1, %alloc_4, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32, "cuda">, memref<20x20xf32, "cuda">, memref<2x20xf32, "cuda"> - byre.compute @PTXOp(%alloc_1, %0, %1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> - %2 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<320xi8, "cuda">) -> memref<2x10xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%1, %alloc_3, %2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32, "cuda">, memref<20x10xf32, "cuda">, memref<2x10xf32, "cuda"> - byre.compute @PTXOp(%alloc_2, %2, %arg1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown2", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : 
memref<10xf32, "cuda">, memref<2x10xf32, "cuda">, memref<2x10xf32, "cuda"> + func.func @forward(%arg0: memref<20x10xf32, "cuda"> {byre.argname = "Weight0", byre.argtype = 4 : i32, byre.weight_value = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> : tensor<20x10xf32>}, %arg1: memref<20xf32, "cuda"> {byre.argname = "Weight1", byre.argtype = 4 : i32, byre.weight_value = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32>}, %arg2: memref<20x20xf32, "cuda"> {byre.argname = "Weight2", byre.argtype = 4 : i32, byre.weight_value = 
dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> : tensor<20x20xf32>}, %arg3: memref<20xf32, "cuda"> {byre.argname = "Weight3", byre.argtype = 4 : i32, byre.weight_value = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 
0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32>}, %arg4: memref<10x20xf32, "cuda"> {byre.argname = "Weight4", byre.argtype = 4 : i32, byre.weight_value = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> : tensor<10x20xf32>}, %arg5: memref<10xf32, "cuda"> {byre.argname = "Weight5", byre.argtype = 4 : i32, byre.weight_value = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32>}, %arg6: memref<2x10xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg7: memref<2x10xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { + %alloc = memref.alloc() : memref<512xi8, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<512xi8, "cuda">) -> memref<2x20xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%arg6, %arg0, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x10xf32, "cuda">, memref<20x10xf32, "cuda">, memref<2x20xf32, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 256 : i64}> : (memref<512xi8, "cuda">) -> memref<2x20xf32, "cuda"> + byre.compute @PTXOp(%arg1, %0, %1) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%1, %arg2, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32, "cuda">, memref<20x20xf32, "cuda">, memref<2x20xf32, "cuda"> + byre.compute @PTXOp(%arg3, %0, %1) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : 
i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<512xi8, "cuda">) -> memref<2x10xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%1, %arg4, %2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32, "cuda">, memref<10x20xf32, "cuda">, memref<2x10xf32, "cuda"> + byre.compute @PTXOp(%arg5, %2, %arg7) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown2", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<10xf32, "cuda">, memref<2x10xf32, "cuda">, memref<2x10xf32, "cuda"> return } - memref.global "private" constant @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> = dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> - memref.global "private" constant @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> - memref.global "private" constant @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> = 
dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> memref.global "private" constant @__constant_10xf32_cuda : memref<10xf32, "cuda"> = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> + memref.global "private" constant @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> 
memref.global "private" constant @__constant_20xf32_0_cuda : memref<20xf32, "cuda"> = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> + memref.global "private" constant @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> = dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4
ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> memref.global "private" constant @__constant_20xf32_cuda : memref<20xf32, "cuda"> = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> + memref.global "private" constant @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/device_output.ptx b/compiler/test/E2E/MLPInference/device_output.ptx index 06cdbbcdd..8a25dc663 100644 --- a/compiler/test/E2E/MLPInference/device_output.ptx +++ b/compiler/test/E2E/MLPInference/device_output.ptx @@ -31,126 +31,48 @@ ) { .reg .pred %p<3>; - .reg .b32 %r<4>; + .reg .b32 %r<5>; .reg .f32 %f<4>; - .reg .b64 %rd<32>; + .reg .b64 %rd<29>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 19; - @%p1 bra $L__BB0_2; - ld.param.u64 %rd5, [Unknown2_param_13]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown2_param_1]; - ld.param.u64 %rd7, [Unknown2_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 7378697629483820647; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 2; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 10; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 10; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 7378697629483820647; - shr.u64 %rd21, %rd20, 63; - shr.u64 %rd22, %rd20, 2; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, 
%rd18; - mul.lo.s64 %rd25, %rd24, 10; - add.s64 %rd26, %rd25, %rd17; - shl.b64 %rd27, %rd26, 2; - add.s64 %rd28, %rd2, %rd27; - ld.global.f32 %f1, [%rd28]; - shl.b64 %rd29, %rd17, 2; - add.s64 %rd30, %rd3, %rd29; - ld.global.f32 %f2, [%rd30]; - add.rn.f32 %f3, %f1, %f2; - add.s64 %rd31, %rd1, %rd27; - st.global.f32 [%rd31], %f3; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd28, %rd16, %rd15; + setp.gt.s64 %p1, %rd28, 19; + @%p1 bra $L__BB0_3; + ld.param.u64 %rd12, [Unknown2_param_13]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown2_param_1]; + ld.param.u64 %rd14, [Unknown2_param_6]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd27, %rd28, 2; + shl.b64 %rd7, %rd5, 2; $L__BB0_2: - ret; - -} - // .globl Unknown1 -.visible .entry Unknown1( - .param .u64 Unknown1_param_0, - .param .u64 Unknown1_param_1, - .param .u64 Unknown1_param_2, - .param .u64 Unknown1_param_3, - .param .u64 Unknown1_param_4, - .param .u64 Unknown1_param_5, - .param .u64 Unknown1_param_6, - .param .u64 Unknown1_param_7, - .param .u64 Unknown1_param_8, - .param .u64 Unknown1_param_9, - .param .u64 Unknown1_param_10, - .param .u64 Unknown1_param_11, - .param .u64 Unknown1_param_12, - .param .u64 Unknown1_param_13, - .param .u64 Unknown1_param_14, - .param .u64 Unknown1_param_15, - .param .u64 Unknown1_param_16, - .param .u64 Unknown1_param_17, - .param .u64 Unknown1_param_18 -) -{ - .reg .pred %p<3>; - .reg .b32 %r<4>; - .reg .f32 %f<5>; - .reg .b64 %rd<32>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 39; - @%p1 bra $L__BB1_2; - ld.param.u64 %rd5, [Unknown1_param_13]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown1_param_1]; - ld.param.u64 %rd7, [Unknown1_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 7378697629483820647; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 3; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 20; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 20; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 7378697629483820647; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 3; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.lo.s64 %rd25, %rd24, 20; - add.s64 %rd26, %rd25, %rd17; - shl.b64 %rd27, %rd26, 2; - add.s64 %rd28, %rd2, %rd27; - ld.global.f32 %f1, [%rd28]; - shl.b64 %rd29, %rd17, 2; - add.s64 %rd30, %rd3, %rd29; - ld.global.f32 %f2, [%rd30]; + mul.hi.s64 %rd17, %rd28, 7378697629483820647; + shr.u64 %rd18, %rd17, 63; + shr.u64 %rd19, %rd17, 2; + add.s64 %rd20, %rd19, %rd18; + mul.lo.s64 %rd21, %rd20, 10; + sub.s64 %rd22, %rd28, %rd21; + shl.b64 %rd23, %rd22, 2; + add.s64 %rd24, %rd3, %rd23; + ld.global.nc.f32 %f1, [%rd24]; + add.s64 %rd25, %rd2, %rd27; + ld.global.nc.f32 %f2, [%rd25]; add.rn.f32 %f3, %f1, %f2; - max.f32 %f4, %f3, 0f00000000; - add.s64 %rd31, %rd1, %rd27; - st.global.f32 [%rd31], %f4; -$L__BB1_2: + add.s64 %rd26, %rd1, %rd27; + st.global.f32 [%rd26], %f3; + add.s64 %rd28, %rd28, %rd5; + add.s64 %rd27, %rd27, %rd7; + setp.lt.s64 %p2, %rd28, 20; + @%p2 bra $L__BB0_2; +$L__BB0_3: ret; } @@ -178,53 +100,49 @@ $L__BB1_2: ) { .reg .pred %p<3>; - .reg .b32 %r<4>; + .reg .b32 %r<5>; .reg .f32 %f<5>; - .reg .b64 
%rd<32>; + .reg .b64 %rd<29>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 39; - @%p1 bra $L__BB2_2; - ld.param.u64 %rd5, [Unknown0_param_13]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown0_param_1]; - ld.param.u64 %rd7, [Unknown0_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 7378697629483820647; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 3; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 20; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 20; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 7378697629483820647; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 3; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.lo.s64 %rd25, %rd24, 20; - add.s64 %rd26, %rd25, %rd17; - shl.b64 %rd27, %rd26, 2; - add.s64 %rd28, %rd2, %rd27; - ld.global.f32 %f1, [%rd28]; - shl.b64 %rd29, %rd17, 2; - add.s64 %rd30, %rd3, %rd29; - ld.global.f32 %f2, [%rd30]; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd28, %rd16, %rd15; + setp.gt.s64 %p1, %rd28, 39; + @%p1 bra $L__BB1_3; + ld.param.u64 %rd12, [Unknown0_param_13]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown0_param_1]; + ld.param.u64 %rd14, [Unknown0_param_6]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd27, %rd28, 2; + shl.b64 %rd7, %rd5, 2; +$L__BB1_2: + mul.hi.s64 %rd17, %rd28, 7378697629483820647; + shr.u64 %rd18, %rd17, 63; + shr.s64 %rd19, %rd17, 3; + add.s64 %rd20, %rd19, %rd18; + mul.lo.s64 %rd21, %rd20, 20; + sub.s64 %rd22, %rd28, %rd21; + shl.b64 %rd23, %rd22, 2; + add.s64 %rd24, %rd3, %rd23; + ld.global.nc.f32 %f1, [%rd24]; + add.s64 %rd25, %rd2, %rd27; + ld.global.nc.f32 %f2, [%rd25]; add.rn.f32 %f3, %f1, %f2; - max.f32 %f4, %f3, 0f00000000; - add.s64 %rd31, %rd1, %rd27; - st.global.f32 [%rd31], %f4; -$L__BB2_2: + max.NaN.f32 %f4, %f3, 0f00000000; + add.s64 %rd26, %rd1, %rd27; + st.global.f32 [%rd26], %f4; + add.s64 %rd28, %rd28, %rd5; + add.s64 %rd27, %rd27, %rd7; + setp.lt.s64 %p2, %rd28, 40; + @%p2 bra $L__BB1_2; +$L__BB1_3: ret; } diff --git a/compiler/test/E2E/MLPInference/host_output.mlir b/compiler/test/E2E/MLPInference/host_output.mlir index 70f74be53..76b0c8720 100644 --- a/compiler/test/E2E/MLPInference/host_output.mlir +++ b/compiler/test/E2E/MLPInference/host_output.mlir @@ -3,29 +3,17 @@ // CHECK-LABEL: func.func @forward module attributes {byre.container_module, gpu.container_module, torch.debug_module_name = "GraphModule"} { - func.func @forward(%arg0: memref<2x10xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<2x10xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point, device_file_name = "your_file"} { - %alloc = memref.alloc() : memref<320xi8, "cuda"> - %alloc_0 = memref.alloc() : memref<20xf32, "cuda"> - byre.compute @FillOp(%alloc_0) {device = "cuda", memory_effects = [2 : i32], value = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32>} 
: memref<20xf32, "cuda"> - %alloc_1 = memref.alloc() : memref<20xf32, "cuda"> - byre.compute @FillOp(%alloc_1) {device = "cuda", memory_effects = [2 : i32], value = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32>} : memref<20xf32, "cuda"> - %alloc_2 = memref.alloc() : memref<10xf32, "cuda"> - byre.compute @FillOp(%alloc_2) {device = "cuda", memory_effects = [2 : i32], value = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32>} : memref<10xf32, "cuda"> - %alloc_3 = memref.alloc() : memref<20x10xf32, "cuda"> - byre.compute @FillOp(%alloc_3) {device = "cuda", memory_effects = [2 : i32], value = dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> : tensor<20x10xf32>} : memref<20x10xf32, "cuda"> - %alloc_4 = memref.alloc() : memref<20x20xf32, "cuda"> - byre.compute @FillOp(%alloc_4) {device = "cuda", memory_effects = [2 : i32], value = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> : tensor<20x20xf32>} : memref<20x20xf32, "cuda"> - %alloc_5 = memref.alloc() : memref<10x20xf32, "cuda"> - byre.compute @FillOp(%alloc_5) {device = "cuda", memory_effects = [2 : i32], value = 
dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> : tensor<10x20xf32>} : memref<10x20xf32, "cuda"> - %0 = "byre.alias"(%alloc) {device = "cuda", offset = 0 : i64} : (memref<320xi8, "cuda">) -> memref<2x20xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%arg0, %alloc_5, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x10xf32, "cuda">, memref<10x20xf32, "cuda">, memref<2x20xf32, "cuda"> - %1 = "byre.alias"(%alloc) {device = "cuda", offset = 160 : i64} : (memref<320xi8, "cuda">) -> memref<2x20xf32, "cuda"> - byre.compute @PTXOp(%alloc_0, %0, %1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%1, %alloc_4, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32, "cuda">, memref<20x20xf32, "cuda">, memref<2x20xf32, "cuda"> - byre.compute @PTXOp(%alloc_1, %0, %1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown1", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> - %2 = "byre.alias"(%alloc) {device = "cuda", offset = 0 : i64} : (memref<320xi8, "cuda">) -> memref<2x10xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%1, %alloc_3, %2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32, "cuda">, memref<20x10xf32, "cuda">, memref<2x10xf32, "cuda"> - byre.compute @PTXOp(%alloc_2, %2, %arg1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : 
i32], device = "cuda", kernel_name = "Unknown2", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<10xf32, "cuda">, memref<2x10xf32, "cuda">, memref<2x10xf32, "cuda"> + func.func @forward(%arg0: memref<20x10xf32, "cuda"> {byre.argname = "Weight0", byre.argtype = 4 : i32, byre.weight_value = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> : tensor<20x10xf32>}, %arg1: memref<20xf32, "cuda"> {byre.argname = "Weight1", byre.argtype = 4 : i32, byre.weight_value = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32>}, %arg2: memref<20x20xf32, "cuda"> {byre.argname = "Weight2", byre.argtype = 4 : i32, byre.weight_value = 
dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> : tensor<20x20xf32>}, %arg3: memref<20xf32, "cuda"> {byre.argname = "Weight3", byre.argtype = 4 : i32, byre.weight_value = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 
0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32>}, %arg4: memref<10x20xf32, "cuda"> {byre.argname = "Weight4", byre.argtype = 4 : i32, byre.weight_value = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> : tensor<10x20xf32>}, %arg5: memref<10xf32, "cuda"> {byre.argname = "Weight5", byre.argtype = 4 : i32, byre.weight_value = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32>}, %arg6: memref<2x10xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg7: memref<2x10xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point, device_file_name = "your_file"} { + %alloc = memref.alloc() : memref<512xi8, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 0 : i64}> {device = "cuda"} : (memref<512xi8, "cuda">) -> memref<2x20xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%arg6, %arg0, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x10xf32, "cuda">, memref<20x10xf32, "cuda">, memref<2x20xf32, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 256 : i64}> {device = "cuda"} : (memref<512xi8, "cuda">) -> memref<2x20xf32, "cuda"> + byre.compute @PTXOp(%arg1, %0, %1) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%1, %arg2, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32, "cuda">, memref<20x20xf32, "cuda">, memref<2x20xf32, "cuda"> + byre.compute @PTXOp(%arg3, %0, %1) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = 
[1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 0 : i64}> {device = "cuda"} : (memref<512xi8, "cuda">) -> memref<2x10xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%1, %arg4, %2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32, "cuda">, memref<10x20xf32, "cuda">, memref<2x10xf32, "cuda"> + byre.compute @PTXOp(%arg5, %2, %arg7) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown2", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<10xf32, "cuda">, memref<2x10xf32, "cuda">, memref<2x10xf32, "cuda"> return } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/10b_ptx_codegen.mlir b/compiler/test/E2E/ResNet18/BW/10b_ptx_codegen.mlir index fe79d2c0d..31ce4b352 100644 --- a/compiler/test/E2E/ResNet18/BW/10b_ptx_codegen.mlir +++ b/compiler/test/E2E/ResNet18/BW/10b_ptx_codegen.mlir @@ -4,7 +4,7 @@ module attributes {byre.container_module, gpu.container_module} { gpu.module @unified { - llvm.func @Unknown99(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown96(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -22,207 +22,56 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(2359296 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(512 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 
- %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(4608 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown98(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - 
%16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(2359296 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(512 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(4608 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown97(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(131072 : index) : i64 - %19 = llvm.mlir.constant(256 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(131072 : index) : i64 + %18 = llvm.mlir.constant(0 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 %22 = llvm.sext %21 : i32 to i64 - %23 = nvvm.read.ptx.sreg.ntid.x : i32 + %23 = nvvm.read.ptx.sreg.tid.x : i32 %24 = llvm.sext %23 : i32 to i64 - %25 = nvvm.read.ptx.sreg.tid.x : i32 - %26 = llvm.sext %25 : i32 to i64 - %27 = llvm.mul %24, %22 : i64 - %28 = llvm.add %26, %27 : i64 - %29 = llvm.icmp "slt" %28, %18 : i64 - llvm.cond_br %29, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %30 = llvm.srem %28, %19 : i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 %31 = llvm.icmp "slt" %30, %17 : i64 - %32 = llvm.add %30, %19 : i64 - %33 = llvm.select %31, %32, %30 : i1, i64 - %34 = llvm.icmp "slt" %28, %17 : i64 - %35 = llvm.sub %20, %28 : i64 - %36 = llvm.select %34, %35, %28 : i1, i64 - %37 = llvm.sdiv %36, %19 : i64 - %38 = llvm.sub %20, %37 : i64 - %39 = llvm.select %34, %38, %37 : i1, i64 - %40 = llvm.mul %39, %19 : i64 - %41 = llvm.add %40, %33 : i64 - %42 = llvm.add %41, %17 : i64 - %43 = llvm.add %42, %17 : i64 - %44 = llvm.getelementptr %arg1[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %45 = llvm.load %44 : !llvm.ptr -> f16 - %46 = llvm.fpext %45 : f16 to f32 - %47 = llvm.getelementptr %arg12[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %46, %47 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(256 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.insertvalue %34, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, 
array<4 x i64>, array<4 x i64>)> + %40 = llvm.insertvalue %34, %39[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %42 = llvm.mul %18, %36 : i64 + %43 = llvm.add %42, %18 : i64 + %44 = llvm.add %43, %18 : i64 + %45 = llvm.add %44, %18 : i64 + %46 = llvm.getelementptr %41[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %47 = llvm.load %46 : !llvm.ptr -> f16 + %48 = llvm.fpext %47 : f16 to f32 + %49 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.insertvalue %34, %49[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.insertvalue %36, %50[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %52 = llvm.insertvalue %34, %51[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %34, %52[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %56 = llvm.getelementptr %55[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %48, %56 : f32, !llvm.ptr + %57 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%57 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown96(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown95(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -242,68 +91,58 @@ module attributes {byre.container_module, gpu.container_module} { %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 %18 = llvm.mlir.constant(2359296 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(512 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 
- %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(4608 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(4608 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.fpext %51 : f16 to f32 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+ %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %52, %60 : f32, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown95(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown94(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -323,230 +162,58 @@ module attributes {byre.container_module, gpu.container_module} { %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 %18 = llvm.mlir.constant(1179648 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(256 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select 
%55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(2304 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown94(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(589824 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(256 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - 
%36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(2304 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown93(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - 
%16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(589824 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(256 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(2304 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(2304 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = 
llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.fpext %51 : f16 to f32 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %52, %60 : f32, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown92(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown91(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -564,45 +231,56 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(32768 : index) : i64 - %19 = llvm.mlir.constant(128 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(32768 : index) : i64 + %18 = llvm.mlir.constant(0 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 %22 = llvm.sext %21 : i32 to i64 - %23 = 
nvvm.read.ptx.sreg.ntid.x : i32 + %23 = nvvm.read.ptx.sreg.tid.x : i32 %24 = llvm.sext %23 : i32 to i64 - %25 = nvvm.read.ptx.sreg.tid.x : i32 - %26 = llvm.sext %25 : i32 to i64 - %27 = llvm.mul %24, %22 : i64 - %28 = llvm.add %26, %27 : i64 - %29 = llvm.icmp "slt" %28, %18 : i64 - llvm.cond_br %29, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %30 = llvm.srem %28, %19 : i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 %31 = llvm.icmp "slt" %30, %17 : i64 - %32 = llvm.add %30, %19 : i64 - %33 = llvm.select %31, %32, %30 : i1, i64 - %34 = llvm.icmp "slt" %28, %17 : i64 - %35 = llvm.sub %20, %28 : i64 - %36 = llvm.select %34, %35, %28 : i1, i64 - %37 = llvm.sdiv %36, %19 : i64 - %38 = llvm.sub %20, %37 : i64 - %39 = llvm.select %34, %38, %37 : i1, i64 - %40 = llvm.mul %39, %19 : i64 - %41 = llvm.add %40, %33 : i64 - %42 = llvm.add %41, %17 : i64 - %43 = llvm.add %42, %17 : i64 - %44 = llvm.getelementptr %arg1[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %45 = llvm.load %44 : !llvm.ptr -> f16 - %46 = llvm.fpext %45 : f16 to f32 - %47 = llvm.getelementptr %arg12[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %46, %47 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(128 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.insertvalue %34, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %40 = llvm.insertvalue %34, %39[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %42 = llvm.mul %18, %36 : i64 + %43 = llvm.add %42, %18 : i64 + %44 = llvm.add %43, %18 : i64 + %45 = llvm.add %44, %18 : i64 + %46 = llvm.getelementptr %41[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %47 = llvm.load %46 : !llvm.ptr -> f16 + %48 = llvm.fpext %47 : f16 to f32 + %49 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.insertvalue %34, %49[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.insertvalue %36, %50[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %52 = llvm.insertvalue %34, %51[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %34, %52[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %56 = llvm.getelementptr %55[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %48, %56 : f32, !llvm.ptr + %57 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%57 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown91(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, 
%arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown90(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -622,68 +300,58 @@ module attributes {byre.container_module, gpu.container_module} { %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 %18 = llvm.mlir.constant(589824 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(256 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(2304 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to 
i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(2304 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.fpext %51 : f16 to f32 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %52, %60 : f32, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown90(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown89(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue 
%arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -703,691 +371,58 @@ module attributes {byre.container_module, gpu.container_module} { %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 %18 = llvm.mlir.constant(294912 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(128 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(1152 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown89(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x 
i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(147456 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(128 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(1152 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br 
^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown88(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(147456 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(128 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : 
i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(1152 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown87(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(8192 : index) : i64 - %19 = llvm.mlir.constant(64 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = nvvm.read.ptx.sreg.ctaid.x : i32 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 %22 = llvm.sext %21 : i32 to i64 - %23 = nvvm.read.ptx.sreg.ntid.x : i32 + %23 = nvvm.read.ptx.sreg.tid.x : i32 %24 = llvm.sext %23 : i32 to 
i64 - %25 = nvvm.read.ptx.sreg.tid.x : i32 - %26 = llvm.sext %25 : i32 to i64 - %27 = llvm.mul %24, %22 : i64 - %28 = llvm.add %26, %27 : i64 - %29 = llvm.icmp "slt" %28, %18 : i64 - llvm.cond_br %29, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %30 = llvm.srem %28, %19 : i64 - %31 = llvm.icmp "slt" %30, %17 : i64 - %32 = llvm.add %30, %19 : i64 - %33 = llvm.select %31, %32, %30 : i1, i64 - %34 = llvm.icmp "slt" %28, %17 : i64 - %35 = llvm.sub %20, %28 : i64 - %36 = llvm.select %34, %35, %28 : i1, i64 - %37 = llvm.sdiv %36, %19 : i64 - %38 = llvm.sub %20, %37 : i64 - %39 = llvm.select %34, %38, %37 : i1, i64 - %40 = llvm.mul %39, %19 : i64 - %41 = llvm.add %40, %33 : i64 - %42 = llvm.add %41, %17 : i64 - %43 = llvm.add %42, %17 : i64 - %44 = llvm.getelementptr %arg1[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %45 = llvm.load %44 : !llvm.ptr -> f16 - %46 = llvm.fpext %45 : f16 to f32 - %47 = llvm.getelementptr %arg12[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %46, %47 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown86(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(147456 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(128 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 
to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(1152 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown85(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 
= llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(73728 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown84(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(36864 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = 
llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown83(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(36864 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 
: i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown82(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(36864 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = 
nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(1152 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load 
%50 : !llvm.ptr -> f16 + %52 = llvm.fpext %51 : f16 to f32 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %52, %60 : f32, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown81(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown86(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -1402,422 +437,59 @@ module attributes {byre.container_module, gpu.container_module} { %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(36864 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = 
llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown80(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %5 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %6 = llvm.insertvalue %arg8, %5[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %7 = llvm.insertvalue %arg9, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %8 = llvm.insertvalue %arg10, %7[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %9 = llvm.mlir.constant(0 : index) : i64 - %10 = llvm.mlir.constant(512000 : index) : i64 - %11 = llvm.mlir.constant(512 : index) : i64 - %12 = llvm.mlir.constant(-1 : index) : i64 - %13 = nvvm.read.ptx.sreg.ctaid.x : i32 - %14 = llvm.sext %13 : i32 to i64 - %15 = nvvm.read.ptx.sreg.ntid.x : i32 - %16 = llvm.sext %15 : i32 to i64 - %17 = nvvm.read.ptx.sreg.tid.x : i32 - %18 = llvm.sext %17 : i32 to i64 - %19 = llvm.mul %16, %14 : i64 - %20 = llvm.add %18, %19 : i64 - %21 = llvm.icmp "slt" %20, %10 : i64 - llvm.cond_br %21, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %22 = llvm.srem %20, %11 : i64 - %23 = llvm.icmp "slt" %22, %9 : i64 - %24 = llvm.add %22, %11 : i64 - %25 = llvm.select %23, %24, %22 : i1, i64 - %26 = llvm.icmp "slt" %20, %9 : i64 - %27 = llvm.sub %12, %20 : i64 - %28 = llvm.select %26, %27, %20 : i1, i64 - %29 = llvm.sdiv %28, %11 : i64 - 
%30 = llvm.sub %12, %29 : i64 - %31 = llvm.select %26, %30, %29 : i1, i64 - %32 = llvm.mul %31, %11 : i64 - %33 = llvm.add %32, %25 : i64 - %34 = llvm.getelementptr %arg1[%33] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %35 = llvm.load %34 : !llvm.ptr -> f16 - %36 = llvm.fpext %35 : f16 to f32 - %37 = llvm.getelementptr %arg8[%33] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %36, %37 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown79(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.mlir.constant(1000 : index) : i64 - %6 = nvvm.read.ptx.sreg.ctaid.x : i32 - %7 = llvm.sext %6 : i32 to i64 - %8 = nvvm.read.ptx.sreg.ntid.x : i32 - %9 = llvm.sext %8 : i32 to i64 - %10 = nvvm.read.ptx.sreg.tid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = llvm.mul %9, %7 : i64 - %13 = llvm.add %11, %12 : i64 - %14 = llvm.icmp "slt" %13, %5 : i64 - llvm.cond_br %14, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %15 = llvm.getelementptr %arg1[%13] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %16 = llvm.load %15 : !llvm.ptr -> f32 - %17 = llvm.fptrunc %16 : f32 to f16 - %18 = llvm.fpext %17 : f16 to f32 - %19 = llvm.getelementptr %arg6[%13] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %18, %19 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown78(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %5 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %6 = llvm.insertvalue %arg8, %5[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %7 = llvm.insertvalue %arg9, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %8 = llvm.insertvalue %arg10, %7[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %9 = llvm.mlir.constant(0 : index) : i64 - %10 = llvm.mlir.constant(1000 : index) : i64 - %11 = nvvm.read.ptx.sreg.ctaid.x : i32 - %12 = llvm.sext %11 : i32 to i64 - %13 = nvvm.read.ptx.sreg.ntid.x : i32 - %14 = llvm.sext %13 : i32 to i64 - %15 = nvvm.read.ptx.sreg.tid.x : i32 - %16 = llvm.sext %15 : i32 to i64 - %17 = llvm.mul %14, %12 : i64 - %18 = llvm.add %16, %17 : i64 - %19 = llvm.icmp "slt" %18, %10 : i64 - llvm.cond_br %19, ^bb1, 
^bb2 - ^bb1: // pred: ^bb0 - %20 = llvm.mul %9, %10 : i64 - %21 = llvm.add %20, %18 : i64 - %22 = llvm.getelementptr %arg1[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %23 = llvm.load %22 : !llvm.ptr -> f16 - %24 = llvm.fpext %23 : f16 to f32 - %25 = llvm.getelementptr %arg8[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %24, %25 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown77(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(9408 : index) : i64 - %19 = llvm.mlir.constant(7 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(3 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = 
llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(147 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(49 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown74(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, 
array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(802816 : index) : i64 - %28 = llvm.mlir.constant(112 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(12544 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown73(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 
x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0 : index) : i64 - %26 = llvm.mlir.constant(200704 : index) : i64 - %27 = llvm.mlir.constant(56 : index) : i64 - %28 = llvm.mlir.constant(-1 : index) : i64 - %29 = nvvm.read.ptx.sreg.ctaid.x : i32 - %30 = llvm.sext %29 : i32 to i64 - %31 = nvvm.read.ptx.sreg.ntid.x : i32 - %32 = llvm.sext %31 : i32 to i64 - %33 = nvvm.read.ptx.sreg.tid.x : i32 - %34 = llvm.sext %33 : i32 to i64 - %35 = llvm.mul %32, %30 : i64 - %36 = llvm.add %34, %35 : i64 - %37 = llvm.icmp "slt" %36, %26 : i64 - llvm.cond_br %37, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %38 = llvm.srem %36, %27 : i64 - %39 = llvm.icmp "slt" %38, %25 : i64 - %40 = llvm.add %38, %27 : i64 - %41 = llvm.select %39, %40, %38 : i1, i64 - %42 = llvm.icmp "slt" %36, %25 : i64 - %43 = llvm.sub %28, %36 : i64 - %44 = llvm.select %42, %43, %36 : i1, i64 - %45 = llvm.sdiv %44, %27 : i64 - %46 = llvm.sub %28, %45 : i64 - %47 = llvm.select %42, %46, %45 : i1, i64 - %48 = llvm.srem %47, %27 : i64 - %49 = llvm.icmp "slt" %48, %25 : i64 - %50 = 
llvm.add %48, %27 : i64 - %51 = llvm.select %49, %50, %48 : i1, i64 - %52 = llvm.icmp "slt" %47, %25 : i64 - %53 = llvm.sub %28, %47 : i64 - %54 = llvm.select %52, %53, %47 : i1, i64 - %55 = llvm.sdiv %54, %27 : i64 - %56 = llvm.sub %28, %55 : i64 - %57 = llvm.select %52, %56, %55 : i1, i64 - %58 = llvm.mul %25, %26 : i64 - %59 = llvm.mlir.constant(3136 : index) : i64 - %60 = llvm.mul %57, %59 : i64 - %61 = llvm.add %58, %60 : i64 - %62 = llvm.mul %51, %27 : i64 - %63 = llvm.add %61, %62 : i64 - %64 = llvm.add %63, %41 : i64 - %65 = llvm.getelementptr %arg1[%64] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %66 = llvm.load %65 : !llvm.ptr -> f16 - %67 = llvm.getelementptr %arg12[%64] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %68 = llvm.load %67 : !llvm.ptr -> f16 - %69 = llvm.fadd %66, %68 : f16 - %70 = llvm.getelementptr %arg23[%64] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %69, %70 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %17 = llvm.mlir.constant(8192 : index) : i64 + %18 = llvm.mlir.constant(0 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %17 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(64 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.insertvalue %34, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %40 = llvm.insertvalue %34, %39[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %42 = llvm.mul %18, %36 : i64 + %43 = llvm.add %42, %18 : i64 + %44 = llvm.add %43, %18 : i64 + %45 = llvm.add %44, %18 : i64 + %46 = llvm.getelementptr %41[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %47 = llvm.load %46 : !llvm.ptr -> f16 + %48 = llvm.fpext %47 : f16 to f32 + %49 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.insertvalue %34, %49[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.insertvalue %36, %50[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %52 = llvm.insertvalue %34, %51[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %34, %52[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x 
i64>)> + %54 = llvm.insertvalue %34, %53[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %56 = llvm.getelementptr %55[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %48, %56 : f32, !llvm.ptr + %57 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%57 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown69(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown85(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -1835,70 +507,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(200704 : index) : i64 - %28 = llvm.mlir.constant(56 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 
- %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(3136 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %17 = llvm.mlir.constant(0 : index) : i64 + %18 = llvm.mlir.constant(147456 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(1152 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.fpext %51 : f16 to f32 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] 
: !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %52, %60 : f32, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown65(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr, %arg34: !llvm.ptr, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown84(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -1916,81 +578,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.insertvalue %arg33, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %26 = llvm.insertvalue %arg34, %25[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %27 = llvm.insertvalue %arg35, %26[2] : !llvm.struct<(ptr, 
ptr, i64, array<4 x i64>, array<4 x i64>)> - %28 = llvm.insertvalue %arg36, %27[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %29 = llvm.insertvalue %arg40, %28[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %30 = llvm.insertvalue %arg37, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %31 = llvm.insertvalue %arg41, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %32 = llvm.insertvalue %arg38, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %33 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %34 = llvm.mlir.constant(0 : index) : i64 - %35 = llvm.mlir.constant(200704 : index) : i64 - %36 = llvm.mlir.constant(56 : index) : i64 - %37 = llvm.mlir.constant(-1 : index) : i64 - %38 = nvvm.read.ptx.sreg.ctaid.x : i32 - %39 = llvm.sext %38 : i32 to i64 - %40 = nvvm.read.ptx.sreg.ntid.x : i32 - %41 = llvm.sext %40 : i32 to i64 - %42 = nvvm.read.ptx.sreg.tid.x : i32 - %43 = llvm.sext %42 : i32 to i64 - %44 = llvm.mul %41, %39 : i64 - %45 = llvm.add %43, %44 : i64 - %46 = llvm.icmp "slt" %45, %35 : i64 - llvm.cond_br %46, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %47 = llvm.srem %45, %36 : i64 - %48 = llvm.icmp "slt" %47, %34 : i64 - %49 = llvm.add %47, %36 : i64 - %50 = llvm.select %48, %49, %47 : i1, i64 - %51 = llvm.icmp "slt" %45, %34 : i64 - %52 = llvm.sub %37, %45 : i64 - %53 = llvm.select %51, %52, %45 : i1, i64 - %54 = llvm.sdiv %53, %36 : i64 - %55 = llvm.sub %37, %54 : i64 - %56 = llvm.select %51, %55, %54 : i1, i64 - %57 = llvm.srem %56, %36 : i64 - %58 = llvm.icmp "slt" %57, %34 : i64 - %59 = llvm.add %57, %36 : i64 - %60 = llvm.select %58, %59, %57 : i1, i64 - %61 = llvm.icmp "slt" %56, %34 : i64 - %62 = llvm.sub %37, %56 : i64 - %63 = llvm.select %61, %62, %56 : i1, i64 - %64 = llvm.sdiv %63, %36 : i64 - %65 = llvm.sub %37, %64 : i64 - %66 = llvm.select %61, %65, %64 : i1, i64 - %67 = llvm.mul %34, %35 : i64 - %68 = llvm.mlir.constant(3136 : index) : i64 - %69 = llvm.mul %66, %68 : i64 - %70 = llvm.add %67, %69 : i64 - %71 = llvm.mul %60, %36 : i64 - %72 = llvm.add %70, %71 : i64 - %73 = llvm.add %72, %50 : i64 - %74 = llvm.getelementptr %arg23[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %75 = llvm.load %74 : !llvm.ptr -> f16 - %76 = llvm.getelementptr %arg1[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %77 = llvm.load %76 : !llvm.ptr -> f16 - %78 = llvm.getelementptr %arg12[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %79 = llvm.load %78 : !llvm.ptr -> f16 - %80 = llvm.fadd %77, %79 : f16 - %81 = llvm.fcmp "ogt" %75, %33 : f16 - %82 = llvm.select %81, %80, %33 : i1, f16 - %83 = llvm.getelementptr %arg34[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %82, %83 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %17 = llvm.mlir.constant(0 : index) : i64 + %18 = llvm.mlir.constant(73728 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(576 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.fpext %51 : f16 to f32 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %52, %60 : f32, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown61(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown80(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2008,70 +649,159 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(200704 : index) : i64 - %28 = llvm.mlir.constant(56 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(3136 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %17 = llvm.mlir.constant(0 : index) : i64 + %18 = llvm.mlir.constant(36864 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 
= llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(576 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.fpext %51 : f16 to f32 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %52, %60 : f32, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown57(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr, %arg34: !llvm.ptr, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown79(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr {llvm.noalias}, %arg8: !llvm.ptr {llvm.noalias}, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64) attributes {gpu.kernel, nvvm.kernel} { + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %2 = 
llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %5 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %6 = llvm.insertvalue %arg8, %5[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %7 = llvm.insertvalue %arg9, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %8 = llvm.insertvalue %arg10, %7[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %9 = llvm.mlir.constant(0 : index) : i64 + %10 = llvm.mlir.constant(512000 : index) : i64 + %11 = nvvm.read.ptx.sreg.ctaid.x : i32 + %12 = llvm.sext %11 : i32 to i64 + %13 = nvvm.read.ptx.sreg.ntid.x : i32 + %14 = llvm.sext %13 : i32 to i64 + %15 = nvvm.read.ptx.sreg.tid.x : i32 + %16 = llvm.sext %15 : i32 to i64 + %17 = llvm.mul %14, %12 : i64 + %18 = llvm.add %16, %17 : i64 + %19 = nvvm.read.ptx.sreg.nctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = llvm.mul %14, %20 : i64 + llvm.br ^bb1(%18 : i64) + ^bb1(%22: i64): // 2 preds: ^bb0, ^bb2 + %23 = llvm.icmp "slt" %22, %10 : i64 + llvm.cond_br %23, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %24 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %25 = llvm.insertvalue %22, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %26 = llvm.mlir.constant(1 : index) : i64 + %27 = llvm.insertvalue %26, %25[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %28 = llvm.mlir.constant(512 : index) : i64 + %29 = llvm.getelementptr %arg1[%22] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %30 = llvm.mul %9, %28 : i64 + %31 = llvm.add %30, %9 : i64 + %32 = llvm.getelementptr %29[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %33 = llvm.load %32 : !llvm.ptr -> f16 + %34 = llvm.fpext %33 : f16 to f32 + %35 = llvm.insertvalue %22, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %36 = llvm.insertvalue %26, %35[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %37 = llvm.getelementptr %arg8[%22] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %38 = llvm.getelementptr %37[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %34, %38 : f32, !llvm.ptr + %39 = llvm.add %22, %21 : i64 + llvm.br ^bb1(%39 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown78(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr {llvm.noalias}, %arg8: !llvm.ptr {llvm.noalias}, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64) attributes {gpu.kernel, nvvm.kernel} { + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %5 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %6 = llvm.insertvalue %arg8, %5[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %7 = llvm.insertvalue %arg9, %6[2] : !llvm.struct<(ptr, 
ptr, i64, array<2 x i64>, array<2 x i64>)> + %8 = llvm.insertvalue %arg10, %7[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %9 = llvm.mlir.constant(0 : index) : i64 + %10 = llvm.mlir.constant(1000 : index) : i64 + %11 = nvvm.read.ptx.sreg.ctaid.x : i32 + %12 = llvm.sext %11 : i32 to i64 + %13 = nvvm.read.ptx.sreg.ntid.x : i32 + %14 = llvm.sext %13 : i32 to i64 + %15 = nvvm.read.ptx.sreg.tid.x : i32 + %16 = llvm.sext %15 : i32 to i64 + %17 = llvm.mul %14, %12 : i64 + %18 = llvm.add %16, %17 : i64 + %19 = nvvm.read.ptx.sreg.nctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = llvm.mul %14, %20 : i64 + llvm.br ^bb1(%18 : i64) + ^bb1(%22: i64): // 2 preds: ^bb0, ^bb2 + %23 = llvm.icmp "slt" %22, %10 : i64 + llvm.cond_br %23, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %24 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %25 = llvm.insertvalue %22, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %26 = llvm.mlir.constant(1 : index) : i64 + %27 = llvm.insertvalue %26, %25[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %28 = llvm.getelementptr %arg1[%22] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %29 = llvm.mul %9, %10 : i64 + %30 = llvm.add %29, %9 : i64 + %31 = llvm.getelementptr %28[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %32 = llvm.load %31 : !llvm.ptr -> f16 + %33 = llvm.fpext %32 : f16 to f32 + %34 = llvm.fptrunc %33 : f32 to f16 + %35 = llvm.fpext %34 : f16 to f32 + %36 = llvm.insertvalue %22, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %37 = llvm.insertvalue %26, %36[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %38 = llvm.getelementptr %arg8[%22] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %39 = llvm.getelementptr %38[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %35, %39 : f32, !llvm.ptr + %40 = llvm.add %22, %21 : i64 + llvm.br ^bb1(%40 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown77(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2089,81 +819,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, 
array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.insertvalue %arg33, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %26 = llvm.insertvalue %arg34, %25[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %27 = llvm.insertvalue %arg35, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %28 = llvm.insertvalue %arg36, %27[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %29 = llvm.insertvalue %arg40, %28[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %30 = llvm.insertvalue %arg37, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %31 = llvm.insertvalue %arg41, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %32 = llvm.insertvalue %arg38, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %33 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %34 = llvm.mlir.constant(0 : index) : i64 - %35 = llvm.mlir.constant(200704 : index) : i64 - %36 = llvm.mlir.constant(56 : index) : i64 - %37 = llvm.mlir.constant(-1 : index) : i64 - %38 = nvvm.read.ptx.sreg.ctaid.x : i32 - %39 = llvm.sext %38 : i32 to i64 - %40 = nvvm.read.ptx.sreg.ntid.x : i32 - %41 = llvm.sext %40 : i32 to i64 - %42 = nvvm.read.ptx.sreg.tid.x : i32 - %43 = llvm.sext %42 : i32 to i64 - %44 = llvm.mul %41, %39 : i64 - %45 = llvm.add %43, %44 : i64 - %46 = llvm.icmp "slt" %45, %35 : i64 - llvm.cond_br %46, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %47 = llvm.srem %45, %36 : i64 - %48 = llvm.icmp "slt" %47, %34 : i64 - %49 = llvm.add %47, %36 : i64 - %50 = llvm.select %48, %49, %47 : i1, i64 - %51 = llvm.icmp "slt" %45, %34 : i64 - %52 = llvm.sub %37, %45 : i64 - %53 = llvm.select %51, %52, %45 : i1, i64 - %54 = llvm.sdiv %53, %36 : i64 - %55 = llvm.sub %37, %54 : i64 - %56 = llvm.select %51, %55, %54 : i1, i64 - %57 = llvm.srem %56, %36 : i64 - %58 = llvm.icmp "slt" %57, %34 : i64 - %59 = llvm.add %57, %36 : i64 - %60 = llvm.select %58, %59, %57 : i1, i64 - %61 = llvm.icmp "slt" %56, %34 : i64 - %62 = llvm.sub %37, %56 : i64 - %63 = llvm.select %61, %62, %56 : i1, i64 - %64 = llvm.sdiv %63, %36 : i64 - %65 = llvm.sub %37, %64 : i64 - %66 = llvm.select %61, %65, %64 : i1, i64 - %67 = llvm.mul %34, %35 : i64 - %68 = llvm.mlir.constant(3136 : index) : i64 - %69 = llvm.mul %66, %68 : i64 - %70 = llvm.add %67, %69 : i64 - %71 = llvm.mul %60, %36 : i64 - %72 = llvm.add %70, %71 : i64 - %73 = llvm.add %72, %50 : i64 - %74 = llvm.getelementptr %arg23[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %75 = llvm.load %74 : !llvm.ptr -> f16 - %76 = llvm.getelementptr %arg1[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %77 = llvm.load %76 : !llvm.ptr -> f16 - %78 = llvm.getelementptr %arg12[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %79 = llvm.load %78 : !llvm.ptr -> f16 - %80 = llvm.fadd %77, %79 : f16 - %81 = llvm.fcmp "ogt" %75, %33 : f16 - %82 = llvm.select %81, %80, %33 : i1, f16 - %83 = llvm.getelementptr %arg34[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %82, %83 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %17 = llvm.mlir.constant(0 : index) : i64 + %18 = llvm.mlir.constant(9408 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : 
i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(147 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(49 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(7 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.fpext %51 : f16 to f32 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %52, %60 : f32, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown50(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown74(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: 
i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2189,62 +898,70 @@ module attributes {byre.container_module, gpu.container_module} { %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(100352 : index) : i64 - %28 = llvm.mlir.constant(28 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 + %25 = llvm.mlir.constant(802816 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 + %32 = nvvm.read.ptx.sreg.tid.x : i32 %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(784 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 
preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.mlir.constant(12544 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.mlir.constant(112 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fcmp "ogt" %59, %26 : f16 + %70 = llvm.select %69, %68, %26 : i1, f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown46(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, 
%arg31: i64, %arg32: i64, %arg33: !llvm.ptr, %arg34: !llvm.ptr, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown73(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2270,73 +987,68 @@ module attributes {byre.container_module, gpu.container_module} { %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.insertvalue %arg33, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %26 = llvm.insertvalue %arg34, %25[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %27 = llvm.insertvalue %arg35, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %28 = llvm.insertvalue %arg36, %27[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %29 = llvm.insertvalue %arg40, %28[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %30 = llvm.insertvalue %arg37, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %31 = llvm.insertvalue %arg41, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %32 = llvm.insertvalue %arg38, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %33 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %34 = llvm.mlir.constant(0 : index) : i64 - %35 = llvm.mlir.constant(100352 : index) : i64 - %36 = llvm.mlir.constant(28 : index) : i64 - %37 = llvm.mlir.constant(-1 : index) : i64 - %38 = nvvm.read.ptx.sreg.ctaid.x : i32 - %39 = llvm.sext %38 : i32 to i64 - %40 = nvvm.read.ptx.sreg.ntid.x : i32 - %41 = llvm.sext %40 : i32 to i64 - %42 = nvvm.read.ptx.sreg.tid.x : i32 - %43 = llvm.sext %42 : i32 to i64 - %44 = llvm.mul %41, %39 : i64 - %45 = llvm.add %43, %44 : i64 - %46 = llvm.icmp "slt" %45, %35 : i64 - llvm.cond_br %46, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %47 = llvm.srem %45, %36 : i64 - %48 = llvm.icmp "slt" %47, %34 : i64 - %49 = llvm.add %47, %36 : i64 - %50 = llvm.select %48, %49, %47 : i1, i64 - %51 = llvm.icmp "slt" %45, %34 : i64 - %52 = llvm.sub %37, %45 : i64 - %53 = llvm.select %51, %52, %45 : i1, i64 - %54 = llvm.sdiv %53, %36 : i64 - %55 = llvm.sub %37, %54 : i64 - %56 = llvm.select %51, %55, %54 : i1, i64 - %57 = llvm.srem %56, %36 : i64 - %58 = llvm.icmp "slt" %57, %34 : i64 - %59 = llvm.add %57, %36 : i64 - %60 = llvm.select %58, %59, %57 : i1, i64 - %61 = llvm.icmp "slt" %56, %34 : i64 - %62 = llvm.sub 
%37, %56 : i64 - %63 = llvm.select %61, %62, %56 : i1, i64 - %64 = llvm.sdiv %63, %36 : i64 - %65 = llvm.sub %37, %64 : i64 - %66 = llvm.select %61, %65, %64 : i1, i64 - %67 = llvm.mul %34, %35 : i64 - %68 = llvm.mlir.constant(784 : index) : i64 - %69 = llvm.mul %66, %68 : i64 - %70 = llvm.add %67, %69 : i64 - %71 = llvm.mul %60, %36 : i64 - %72 = llvm.add %70, %71 : i64 - %73 = llvm.add %72, %50 : i64 - %74 = llvm.getelementptr %arg23[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %75 = llvm.load %74 : !llvm.ptr -> f16 - %76 = llvm.getelementptr %arg1[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %77 = llvm.load %76 : !llvm.ptr -> f16 - %78 = llvm.getelementptr %arg12[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %79 = llvm.load %78 : !llvm.ptr -> f16 - %80 = llvm.fadd %77, %79 : f16 - %81 = llvm.fcmp "ogt" %75, %33 : f16 - %82 = llvm.select %81, %80, %33 : i1, f16 - %83 = llvm.getelementptr %arg34[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %82, %83 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %25 = llvm.mlir.constant(200704 : index) : i64 + %26 = llvm.mlir.constant(0 : index) : i64 + %27 = nvvm.read.ptx.sreg.ctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = nvvm.read.ptx.sreg.ntid.x : i32 + %30 = llvm.sext %29 : i32 to i64 + %31 = nvvm.read.ptx.sreg.tid.x : i32 + %32 = llvm.sext %31 : i32 to i64 + %33 = llvm.mul %30, %28 : i64 + %34 = llvm.add %32, %33 : i64 + %35 = nvvm.read.ptx.sreg.nctaid.x : i32 + %36 = llvm.sext %35 : i32 to i64 + %37 = llvm.mul %30, %36 : i64 + llvm.br ^bb1(%34 : i64) + ^bb1(%38: i64): // 2 preds: ^bb0, ^bb2 + %39 = llvm.icmp "slt" %38, %25 : i64 + llvm.cond_br %39, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %40 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %41 = llvm.insertvalue %38, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(1 : index) : i64 + %43 = llvm.insertvalue %42, %41[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %44 = llvm.insertvalue %25, %43[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %42, %44[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.mlir.constant(3136 : index) : i64 + %47 = llvm.insertvalue %46, %45[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %48 = llvm.insertvalue %42, %47[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.mlir.constant(56 : index) : i64 + %50 = llvm.getelementptr %arg1[%38] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.mul %26, %25 : i64 + %52 = llvm.mul %26, %46 : i64 + %53 = llvm.add %51, %52 : i64 + %54 = llvm.mul %26, %49 : i64 + %55 = llvm.add %53, %54 : i64 + %56 = llvm.add %55, %26 : i64 + %57 = llvm.getelementptr %50[%56] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %58 = llvm.load %57 : !llvm.ptr -> f16 + %59 = llvm.insertvalue %38, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %60 = llvm.insertvalue %42, %59[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %25, %60[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %42, %61[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %46, %62[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %42, %63[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.getelementptr %arg12[%38] : (!llvm.ptr, i64) 
-> !llvm.ptr, f16 + %66 = llvm.getelementptr %65[%56] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.load %66 : !llvm.ptr -> f16 + %68 = llvm.fadd %58, %67 : f16 + %69 = llvm.insertvalue %38, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %70 = llvm.insertvalue %42, %69[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %71 = llvm.insertvalue %25, %70[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %42, %71[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %46, %72[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %42, %73[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.getelementptr %arg23[%38] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %76 = llvm.getelementptr %75[%56] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %68, %76 : f16, !llvm.ptr + %77 = llvm.add %38, %37 : i64 + llvm.br ^bb1(%77 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown42(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown61(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2362,62 +1074,70 @@ module attributes {byre.container_module, gpu.container_module} { %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(100352 : index) : i64 - %28 = llvm.mlir.constant(28 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 + %25 = llvm.mlir.constant(200704 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 + %32 = nvvm.read.ptx.sreg.tid.x : i32 %33 = llvm.sext %32 : 
i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(784 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.mlir.constant(3136 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.mlir.constant(56 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = 
llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fcmp "ogt" %59, %26 : f16 + %70 = llvm.select %69, %68, %26 : i1, f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown38(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr, %arg34: !llvm.ptr, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown57(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr {llvm.noalias}, %arg34: !llvm.ptr {llvm.noalias}, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2451,65 +1171,80 @@ module attributes {byre.container_module, gpu.container_module} { %30 = llvm.insertvalue %arg37, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %31 = llvm.insertvalue %arg41, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %32 = llvm.insertvalue 
%arg38, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %33 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %34 = llvm.mlir.constant(0 : index) : i64 - %35 = llvm.mlir.constant(100352 : index) : i64 - %36 = llvm.mlir.constant(28 : index) : i64 - %37 = llvm.mlir.constant(-1 : index) : i64 - %38 = nvvm.read.ptx.sreg.ctaid.x : i32 + %33 = llvm.mlir.constant(200704 : index) : i64 + %34 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %35 = llvm.mlir.constant(0 : index) : i64 + %36 = nvvm.read.ptx.sreg.ctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = nvvm.read.ptx.sreg.ntid.x : i32 %39 = llvm.sext %38 : i32 to i64 - %40 = nvvm.read.ptx.sreg.ntid.x : i32 + %40 = nvvm.read.ptx.sreg.tid.x : i32 %41 = llvm.sext %40 : i32 to i64 - %42 = nvvm.read.ptx.sreg.tid.x : i32 - %43 = llvm.sext %42 : i32 to i64 - %44 = llvm.mul %41, %39 : i64 - %45 = llvm.add %43, %44 : i64 - %46 = llvm.icmp "slt" %45, %35 : i64 - llvm.cond_br %46, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %47 = llvm.srem %45, %36 : i64 - %48 = llvm.icmp "slt" %47, %34 : i64 - %49 = llvm.add %47, %36 : i64 - %50 = llvm.select %48, %49, %47 : i1, i64 - %51 = llvm.icmp "slt" %45, %34 : i64 - %52 = llvm.sub %37, %45 : i64 - %53 = llvm.select %51, %52, %45 : i1, i64 - %54 = llvm.sdiv %53, %36 : i64 - %55 = llvm.sub %37, %54 : i64 - %56 = llvm.select %51, %55, %54 : i1, i64 - %57 = llvm.srem %56, %36 : i64 - %58 = llvm.icmp "slt" %57, %34 : i64 - %59 = llvm.add %57, %36 : i64 - %60 = llvm.select %58, %59, %57 : i1, i64 - %61 = llvm.icmp "slt" %56, %34 : i64 - %62 = llvm.sub %37, %56 : i64 - %63 = llvm.select %61, %62, %56 : i1, i64 - %64 = llvm.sdiv %63, %36 : i64 - %65 = llvm.sub %37, %64 : i64 - %66 = llvm.select %61, %65, %64 : i1, i64 - %67 = llvm.mul %34, %35 : i64 - %68 = llvm.mlir.constant(784 : index) : i64 - %69 = llvm.mul %66, %68 : i64 - %70 = llvm.add %67, %69 : i64 - %71 = llvm.mul %60, %36 : i64 - %72 = llvm.add %70, %71 : i64 - %73 = llvm.add %72, %50 : i64 - %74 = llvm.getelementptr %arg23[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %75 = llvm.load %74 : !llvm.ptr -> f16 - %76 = llvm.getelementptr %arg1[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %77 = llvm.load %76 : !llvm.ptr -> f16 - %78 = llvm.getelementptr %arg12[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %79 = llvm.load %78 : !llvm.ptr -> f16 - %80 = llvm.fadd %77, %79 : f16 - %81 = llvm.fcmp "ogt" %75, %33 : f16 - %82 = llvm.select %81, %80, %33 : i1, f16 - %83 = llvm.getelementptr %arg34[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %82, %83 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %42 = llvm.mul %39, %37 : i64 + %43 = llvm.add %41, %42 : i64 + %44 = nvvm.read.ptx.sreg.nctaid.x : i32 + %45 = llvm.sext %44 : i32 to i64 + %46 = llvm.mul %39, %45 : i64 + llvm.br ^bb1(%43 : i64) + ^bb1(%47: i64): // 2 preds: ^bb0, ^bb2 + %48 = llvm.icmp "slt" %47, %33 : i64 + llvm.cond_br %48, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %49 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %50 = llvm.insertvalue %47, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.mlir.constant(1 : index) : i64 + %52 = llvm.insertvalue %51, %50[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %33, %52[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %51, %53[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.mlir.constant(3136 : index) : i64 + %56 = llvm.insertvalue %55, %54[4, 1] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %51, %56[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.mlir.constant(56 : index) : i64 + %59 = llvm.getelementptr %arg1[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.mul %35, %33 : i64 + %61 = llvm.mul %35, %55 : i64 + %62 = llvm.add %60, %61 : i64 + %63 = llvm.mul %35, %58 : i64 + %64 = llvm.add %62, %63 : i64 + %65 = llvm.add %64, %35 : i64 + %66 = llvm.getelementptr %59[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.load %66 : !llvm.ptr -> f16 + %68 = llvm.insertvalue %47, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %69 = llvm.insertvalue %51, %68[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %70 = llvm.insertvalue %33, %69[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %71 = llvm.insertvalue %51, %70[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %55, %71[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %51, %72[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.getelementptr %arg12[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %75 = llvm.getelementptr %74[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %76 = llvm.load %75 : !llvm.ptr -> f16 + %77 = llvm.insertvalue %47, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %78 = llvm.insertvalue %51, %77[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %79 = llvm.insertvalue %33, %78[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %80 = llvm.insertvalue %51, %79[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %81 = llvm.insertvalue %55, %80[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %82 = llvm.insertvalue %51, %81[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %83 = llvm.getelementptr %arg23[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %84 = llvm.getelementptr %83[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %85 = llvm.load %84 : !llvm.ptr -> f16 + %86 = llvm.fadd %67, %76 : f16 + %87 = llvm.fcmp "ogt" %85, %34 : f16 + %88 = llvm.select %87, %86, %34 : i1, f16 + %89 = llvm.insertvalue %47, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %90 = llvm.insertvalue %51, %89[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %91 = llvm.insertvalue %33, %90[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %92 = llvm.insertvalue %51, %91[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %93 = llvm.insertvalue %55, %92[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %94 = llvm.insertvalue %51, %93[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %95 = llvm.getelementptr %arg34[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %96 = llvm.getelementptr %95[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %88, %96 : f16, !llvm.ptr + %97 = llvm.add %47, %46 : i64 + llvm.br ^bb1(%97 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown31(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: 
i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown42(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2535,62 +1270,70 @@ module attributes {byre.container_module, gpu.container_module} { %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(50176 : index) : i64 - %28 = llvm.mlir.constant(14 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 + %25 = llvm.mlir.constant(100352 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 + %32 = nvvm.read.ptx.sreg.tid.x : i32 %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(196 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : 
(!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.mlir.constant(784 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.mlir.constant(28 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fcmp "ogt" %59, %26 : f16 + %70 = llvm.select %69, %68, %26 : i1, f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = 
llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown27(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr, %arg34: !llvm.ptr, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown38(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr {llvm.noalias}, %arg34: !llvm.ptr {llvm.noalias}, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2624,65 +1367,80 @@ module attributes {byre.container_module, gpu.container_module} { %30 = llvm.insertvalue %arg37, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %31 = llvm.insertvalue %arg41, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %32 = llvm.insertvalue %arg38, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %33 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %34 = llvm.mlir.constant(0 : index) : i64 - %35 = llvm.mlir.constant(50176 : index) : i64 - %36 = llvm.mlir.constant(14 : index) : i64 - %37 = llvm.mlir.constant(-1 : index) : i64 - %38 = nvvm.read.ptx.sreg.ctaid.x : i32 + %33 = llvm.mlir.constant(100352 : index) : i64 + %34 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %35 = llvm.mlir.constant(0 : index) : i64 + %36 = nvvm.read.ptx.sreg.ctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = nvvm.read.ptx.sreg.ntid.x : i32 %39 = llvm.sext %38 : i32 to i64 - %40 = nvvm.read.ptx.sreg.ntid.x : i32 + %40 = nvvm.read.ptx.sreg.tid.x : i32 %41 = llvm.sext %40 : i32 to i64 - %42 = nvvm.read.ptx.sreg.tid.x : i32 - %43 = llvm.sext %42 : i32 to i64 - %44 = llvm.mul %41, %39 : i64 - %45 = llvm.add %43, %44 : i64 - %46 = llvm.icmp "slt" %45, %35 : i64 - llvm.cond_br %46, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %47 = llvm.srem %45, %36 : i64 - %48 = llvm.icmp "slt" %47, %34 : i64 - %49 = llvm.add %47, %36 : i64 - %50 = llvm.select %48, %49, %47 : i1, i64 - %51 = llvm.icmp "slt" %45, %34 : i64 - %52 = llvm.sub %37, %45 : i64 - %53 = llvm.select %51, %52, %45 : i1, i64 - %54 = llvm.sdiv %53, %36 : i64 - %55 = llvm.sub %37, %54 : i64 - %56 = llvm.select %51, %55, %54 : i1, i64 - %57 = llvm.srem %56, %36 : 
i64 - %58 = llvm.icmp "slt" %57, %34 : i64 - %59 = llvm.add %57, %36 : i64 - %60 = llvm.select %58, %59, %57 : i1, i64 - %61 = llvm.icmp "slt" %56, %34 : i64 - %62 = llvm.sub %37, %56 : i64 - %63 = llvm.select %61, %62, %56 : i1, i64 - %64 = llvm.sdiv %63, %36 : i64 - %65 = llvm.sub %37, %64 : i64 - %66 = llvm.select %61, %65, %64 : i1, i64 - %67 = llvm.mul %34, %35 : i64 - %68 = llvm.mlir.constant(196 : index) : i64 - %69 = llvm.mul %66, %68 : i64 - %70 = llvm.add %67, %69 : i64 - %71 = llvm.mul %60, %36 : i64 - %72 = llvm.add %70, %71 : i64 - %73 = llvm.add %72, %50 : i64 - %74 = llvm.getelementptr %arg23[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %75 = llvm.load %74 : !llvm.ptr -> f16 - %76 = llvm.getelementptr %arg1[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %77 = llvm.load %76 : !llvm.ptr -> f16 - %78 = llvm.getelementptr %arg12[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %79 = llvm.load %78 : !llvm.ptr -> f16 - %80 = llvm.fadd %77, %79 : f16 - %81 = llvm.fcmp "ogt" %75, %33 : f16 - %82 = llvm.select %81, %80, %33 : i1, f16 - %83 = llvm.getelementptr %arg34[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %82, %83 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %42 = llvm.mul %39, %37 : i64 + %43 = llvm.add %41, %42 : i64 + %44 = nvvm.read.ptx.sreg.nctaid.x : i32 + %45 = llvm.sext %44 : i32 to i64 + %46 = llvm.mul %39, %45 : i64 + llvm.br ^bb1(%43 : i64) + ^bb1(%47: i64): // 2 preds: ^bb0, ^bb2 + %48 = llvm.icmp "slt" %47, %33 : i64 + llvm.cond_br %48, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %49 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %50 = llvm.insertvalue %47, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.mlir.constant(1 : index) : i64 + %52 = llvm.insertvalue %51, %50[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %33, %52[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %51, %53[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.mlir.constant(784 : index) : i64 + %56 = llvm.insertvalue %55, %54[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %51, %56[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.mlir.constant(28 : index) : i64 + %59 = llvm.getelementptr %arg1[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.mul %35, %33 : i64 + %61 = llvm.mul %35, %55 : i64 + %62 = llvm.add %60, %61 : i64 + %63 = llvm.mul %35, %58 : i64 + %64 = llvm.add %62, %63 : i64 + %65 = llvm.add %64, %35 : i64 + %66 = llvm.getelementptr %59[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.load %66 : !llvm.ptr -> f16 + %68 = llvm.insertvalue %47, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %69 = llvm.insertvalue %51, %68[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %70 = llvm.insertvalue %33, %69[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %71 = llvm.insertvalue %51, %70[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %55, %71[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %51, %72[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.getelementptr %arg12[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %75 = llvm.getelementptr %74[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %76 = llvm.load %75 : !llvm.ptr -> f16 + %77 = 
llvm.insertvalue %47, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %78 = llvm.insertvalue %51, %77[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %79 = llvm.insertvalue %33, %78[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %80 = llvm.insertvalue %51, %79[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %81 = llvm.insertvalue %55, %80[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %82 = llvm.insertvalue %51, %81[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %83 = llvm.getelementptr %arg23[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %84 = llvm.getelementptr %83[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %85 = llvm.load %84 : !llvm.ptr -> f16 + %86 = llvm.fadd %67, %76 : f16 + %87 = llvm.fcmp "ogt" %85, %34 : f16 + %88 = llvm.select %87, %86, %34 : i1, f16 + %89 = llvm.insertvalue %47, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %90 = llvm.insertvalue %51, %89[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %91 = llvm.insertvalue %33, %90[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %92 = llvm.insertvalue %51, %91[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %93 = llvm.insertvalue %55, %92[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %94 = llvm.insertvalue %51, %93[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %95 = llvm.getelementptr %arg34[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %96 = llvm.getelementptr %95[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %88, %96 : f16, !llvm.ptr + %97 = llvm.add %47, %46 : i64 + llvm.br ^bb1(%97 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown23(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown23(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2708,62 +1466,70 @@ module attributes {byre.container_module, gpu.container_module} { %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %24 = llvm.insertvalue 
%arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(50176 : index) : i64 - %28 = llvm.mlir.constant(14 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 + %25 = llvm.mlir.constant(50176 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 + %32 = nvvm.read.ptx.sreg.tid.x : i32 %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(196 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.mlir.constant(196 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + 
%50 = llvm.mlir.constant(14 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fcmp "ogt" %59, %26 : f16 + %70 = llvm.select %69, %68, %26 : i1, f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown19(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr, %arg34: !llvm.ptr, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown19(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr {llvm.noalias}, %arg34: !llvm.ptr {llvm.noalias}, %arg35: i64, %arg36: i64, 
%arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2797,146 +1563,80 @@ module attributes {byre.container_module, gpu.container_module} { %30 = llvm.insertvalue %arg37, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %31 = llvm.insertvalue %arg41, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %32 = llvm.insertvalue %arg38, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %33 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %34 = llvm.mlir.constant(0 : index) : i64 - %35 = llvm.mlir.constant(50176 : index) : i64 - %36 = llvm.mlir.constant(14 : index) : i64 - %37 = llvm.mlir.constant(-1 : index) : i64 - %38 = nvvm.read.ptx.sreg.ctaid.x : i32 + %33 = llvm.mlir.constant(50176 : index) : i64 + %34 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %35 = llvm.mlir.constant(0 : index) : i64 + %36 = nvvm.read.ptx.sreg.ctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = nvvm.read.ptx.sreg.ntid.x : i32 %39 = llvm.sext %38 : i32 to i64 - %40 = nvvm.read.ptx.sreg.ntid.x : i32 + %40 = nvvm.read.ptx.sreg.tid.x : i32 %41 = llvm.sext %40 : i32 to i64 - %42 = nvvm.read.ptx.sreg.tid.x : i32 - %43 = llvm.sext %42 : i32 to i64 - %44 = llvm.mul %41, %39 : i64 - %45 = llvm.add %43, %44 : i64 - %46 = llvm.icmp "slt" %45, %35 : i64 - llvm.cond_br %46, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %47 = llvm.srem %45, %36 : i64 - %48 = llvm.icmp "slt" %47, %34 : i64 - %49 = llvm.add %47, %36 : i64 - %50 = llvm.select %48, %49, %47 : i1, i64 - %51 = llvm.icmp "slt" %45, %34 : i64 - %52 = llvm.sub %37, %45 : i64 - %53 = llvm.select %51, %52, %45 : i1, i64 - %54 = llvm.sdiv %53, %36 : i64 - %55 = llvm.sub %37, %54 : i64 - %56 = llvm.select %51, %55, %54 : i1, i64 - %57 = llvm.srem %56, %36 : i64 - %58 = llvm.icmp "slt" %57, %34 : i64 - %59 = llvm.add %57, %36 : i64 - %60 = llvm.select %58, %59, %57 : i1, i64 - %61 = llvm.icmp "slt" %56, %34 : i64 - %62 = llvm.sub %37, %56 : i64 - %63 = llvm.select %61, %62, %56 : i1, i64 - %64 = llvm.sdiv %63, %36 : i64 - %65 = llvm.sub %37, %64 : i64 - %66 = llvm.select %61, %65, %64 : i1, i64 - %67 = llvm.mul %34, %35 : i64 - %68 = llvm.mlir.constant(196 : index) : i64 - %69 = llvm.mul %66, %68 : i64 - %70 = llvm.add %67, %69 : i64 - %71 = llvm.mul %60, %36 : i64 - %72 = llvm.add %70, %71 : i64 - %73 = llvm.add %72, %50 : i64 - %74 = llvm.getelementptr %arg23[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %75 = llvm.load %74 : !llvm.ptr -> f16 - %76 = llvm.getelementptr %arg1[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %77 = llvm.load %76 : !llvm.ptr -> f16 - %78 = llvm.getelementptr %arg12[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %79 = llvm.load %78 : !llvm.ptr -> f16 - %80 = llvm.fadd %77, %79 : f16 - %81 = llvm.fcmp "ogt" %75, %33 : f16 - %82 = llvm.select %81, %80, %33 : i1, f16 - %83 = llvm.getelementptr %arg34[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %82, %83 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown12(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: 
!llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(25088 : index) : i64 - %28 = llvm.mlir.constant(7 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem 
%37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(49 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 + %42 = llvm.mul %39, %37 : i64 + %43 = llvm.add %41, %42 : i64 + %44 = nvvm.read.ptx.sreg.nctaid.x : i32 + %45 = llvm.sext %44 : i32 to i64 + %46 = llvm.mul %39, %45 : i64 + llvm.br ^bb1(%43 : i64) + ^bb1(%47: i64): // 2 preds: ^bb0, ^bb2 + %48 = llvm.icmp "slt" %47, %33 : i64 + llvm.cond_br %48, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %49 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %50 = llvm.insertvalue %47, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.mlir.constant(1 : index) : i64 + %52 = llvm.insertvalue %51, %50[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %33, %52[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %51, %53[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.mlir.constant(196 : index) : i64 + %56 = llvm.insertvalue %55, %54[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %51, %56[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.mlir.constant(14 : index) : i64 + %59 = llvm.getelementptr %arg1[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.mul %35, %33 : i64 + %61 = llvm.mul %35, %55 : i64 + %62 = llvm.add %60, %61 : i64 + %63 = llvm.mul %35, %58 : i64 %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %65 = llvm.add %64, %35 : i64 + %66 = llvm.getelementptr %59[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %68 = llvm.insertvalue %47, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %69 = llvm.insertvalue %51, %68[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %70 = llvm.insertvalue %33, %69[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %71 = llvm.insertvalue %51, %70[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %55, %71[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %51, %72[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.getelementptr %arg12[%47] : 
(!llvm.ptr, i64) -> !llvm.ptr, f16 + %75 = llvm.getelementptr %74[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %76 = llvm.load %75 : !llvm.ptr -> f16 + %77 = llvm.insertvalue %47, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %78 = llvm.insertvalue %51, %77[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %79 = llvm.insertvalue %33, %78[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %80 = llvm.insertvalue %51, %79[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %81 = llvm.insertvalue %55, %80[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %82 = llvm.insertvalue %51, %81[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %83 = llvm.getelementptr %arg23[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %84 = llvm.getelementptr %83[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %85 = llvm.load %84 : !llvm.ptr -> f16 + %86 = llvm.fadd %67, %76 : f16 + %87 = llvm.fcmp "ogt" %85, %34 : f16 + %88 = llvm.select %87, %86, %34 : i1, f16 + %89 = llvm.insertvalue %47, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %90 = llvm.insertvalue %51, %89[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %91 = llvm.insertvalue %33, %90[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %92 = llvm.insertvalue %51, %91[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %93 = llvm.insertvalue %55, %92[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %94 = llvm.insertvalue %51, %93[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %95 = llvm.getelementptr %arg34[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %96 = llvm.getelementptr %95[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %88, %96 : f16, !llvm.ptr + %97 = llvm.add %47, %46 : i64 + llvm.br ^bb1(%97 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown8(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr, %arg34: !llvm.ptr, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown8(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr {llvm.noalias}, %arg34: !llvm.ptr {llvm.noalias}, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, 
%0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2970,65 +1670,80 @@ module attributes {byre.container_module, gpu.container_module} { %30 = llvm.insertvalue %arg37, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %31 = llvm.insertvalue %arg41, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %32 = llvm.insertvalue %arg38, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %33 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %34 = llvm.mlir.constant(0 : index) : i64 - %35 = llvm.mlir.constant(25088 : index) : i64 - %36 = llvm.mlir.constant(7 : index) : i64 - %37 = llvm.mlir.constant(-1 : index) : i64 - %38 = nvvm.read.ptx.sreg.ctaid.x : i32 + %33 = llvm.mlir.constant(25088 : index) : i64 + %34 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %35 = llvm.mlir.constant(0 : index) : i64 + %36 = nvvm.read.ptx.sreg.ctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = nvvm.read.ptx.sreg.ntid.x : i32 %39 = llvm.sext %38 : i32 to i64 - %40 = nvvm.read.ptx.sreg.ntid.x : i32 + %40 = nvvm.read.ptx.sreg.tid.x : i32 %41 = llvm.sext %40 : i32 to i64 - %42 = nvvm.read.ptx.sreg.tid.x : i32 - %43 = llvm.sext %42 : i32 to i64 - %44 = llvm.mul %41, %39 : i64 - %45 = llvm.add %43, %44 : i64 - %46 = llvm.icmp "slt" %45, %35 : i64 - llvm.cond_br %46, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %47 = llvm.srem %45, %36 : i64 - %48 = llvm.icmp "slt" %47, %34 : i64 - %49 = llvm.add %47, %36 : i64 - %50 = llvm.select %48, %49, %47 : i1, i64 - %51 = llvm.icmp "slt" %45, %34 : i64 - %52 = llvm.sub %37, %45 : i64 - %53 = llvm.select %51, %52, %45 : i1, i64 - %54 = llvm.sdiv %53, %36 : i64 - %55 = llvm.sub %37, %54 : i64 - %56 = llvm.select %51, %55, %54 : i1, i64 - %57 = llvm.srem %56, %36 : i64 - %58 = llvm.icmp "slt" %57, %34 : i64 - %59 = llvm.add %57, %36 : i64 - %60 = llvm.select %58, %59, %57 : i1, i64 - %61 = llvm.icmp "slt" %56, %34 : i64 - %62 = llvm.sub %37, %56 : i64 - %63 = llvm.select %61, %62, %56 : i1, i64 - %64 = llvm.sdiv %63, %36 : i64 - %65 = llvm.sub %37, %64 : i64 - %66 = llvm.select %61, %65, %64 : i1, i64 - %67 = llvm.mul %34, %35 : i64 - %68 = llvm.mlir.constant(49 : index) : i64 - %69 = llvm.mul %66, %68 : i64 - %70 = llvm.add %67, %69 : i64 - %71 = llvm.mul %60, %36 : i64 - %72 = llvm.add %70, %71 : i64 - %73 = llvm.add %72, %50 : i64 - %74 = llvm.getelementptr %arg23[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %75 = llvm.load %74 : !llvm.ptr -> f16 - %76 = llvm.getelementptr %arg1[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %77 = llvm.load %76 : !llvm.ptr -> f16 - %78 = llvm.getelementptr %arg12[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %79 = llvm.load %78 : !llvm.ptr -> f16 - %80 = llvm.fadd %77, %79 : f16 - %81 = llvm.fcmp "ogt" %75, %33 : f16 - %82 = llvm.select %81, %80, %33 : i1, f16 - %83 = llvm.getelementptr %arg34[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %82, %83 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %42 = llvm.mul %39, %37 : i64 + %43 = llvm.add %41, %42 : i64 + %44 = nvvm.read.ptx.sreg.nctaid.x : i32 + %45 = llvm.sext %44 : i32 to i64 + %46 = llvm.mul %39, %45 : i64 + llvm.br ^bb1(%43 : i64) + ^bb1(%47: i64): // 2 preds: ^bb0, ^bb2 + %48 = llvm.icmp "slt" %47, %33 : i64 + llvm.cond_br %48, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %49 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %50 = llvm.insertvalue %47, %2[2] : !llvm.struct<(ptr, ptr, i64, 
array<4 x i64>, array<4 x i64>)> + %51 = llvm.mlir.constant(1 : index) : i64 + %52 = llvm.insertvalue %51, %50[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %33, %52[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %51, %53[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.mlir.constant(49 : index) : i64 + %56 = llvm.insertvalue %55, %54[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %51, %56[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.mlir.constant(7 : index) : i64 + %59 = llvm.getelementptr %arg1[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.mul %35, %33 : i64 + %61 = llvm.mul %35, %55 : i64 + %62 = llvm.add %60, %61 : i64 + %63 = llvm.mul %35, %58 : i64 + %64 = llvm.add %62, %63 : i64 + %65 = llvm.add %64, %35 : i64 + %66 = llvm.getelementptr %59[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.load %66 : !llvm.ptr -> f16 + %68 = llvm.insertvalue %47, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %69 = llvm.insertvalue %51, %68[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %70 = llvm.insertvalue %33, %69[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %71 = llvm.insertvalue %51, %70[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %55, %71[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %51, %72[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.getelementptr %arg12[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %75 = llvm.getelementptr %74[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %76 = llvm.load %75 : !llvm.ptr -> f16 + %77 = llvm.insertvalue %47, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %78 = llvm.insertvalue %51, %77[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %79 = llvm.insertvalue %33, %78[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %80 = llvm.insertvalue %51, %79[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %81 = llvm.insertvalue %55, %80[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %82 = llvm.insertvalue %51, %81[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %83 = llvm.getelementptr %arg23[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %84 = llvm.getelementptr %83[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %85 = llvm.load %84 : !llvm.ptr -> f16 + %86 = llvm.fadd %67, %76 : f16 + %87 = llvm.fcmp "ogt" %85, %34 : f16 + %88 = llvm.select %87, %86, %34 : i1, f16 + %89 = llvm.insertvalue %47, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %90 = llvm.insertvalue %51, %89[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %91 = llvm.insertvalue %33, %90[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %92 = llvm.insertvalue %51, %91[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %93 = llvm.insertvalue %55, %92[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %94 = llvm.insertvalue %51, %93[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %95 = llvm.getelementptr %arg34[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %96 = llvm.getelementptr %95[%65] : (!llvm.ptr, 
i64) -> !llvm.ptr, f16 + llvm.store %88, %96 : f16, !llvm.ptr + %97 = llvm.add %47, %46 : i64 + llvm.br ^bb1(%97 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown4(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown4(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3054,62 +1769,70 @@ module attributes {byre.container_module, gpu.container_module} { %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(25088 : index) : i64 - %28 = llvm.mlir.constant(7 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 + %25 = llvm.mlir.constant(25088 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 + %32 = nvvm.read.ptx.sreg.tid.x : i32 %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = 
llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(49 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.mlir.constant(49 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.mlir.constant(7 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fcmp "ogt" %59, %26 : f16 + %70 = llvm.select %69, %68, %26 : i1, f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: !llvm.ptr, %arg19: !llvm.ptr, %arg20: i64, %arg21: i64, %arg22: i64, %arg23: i64, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown0(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr {llvm.noalias}, %arg8: !llvm.ptr {llvm.noalias}, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: !llvm.ptr {llvm.noalias}, %arg19: !llvm.ptr {llvm.noalias}, %arg20: i64, %arg21: i64, %arg22: i64, %arg23: i64, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> @@ -3132,64 +1855,69 @@ module attributes {byre.container_module, gpu.container_module} { %19 = llvm.insertvalue %arg22, %18[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %20 = llvm.insertvalue %arg26, %19[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %21 = llvm.insertvalue %arg23, %20[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %23 = llvm.mlir.constant(4.900000e+01 : f16) : f16 - %24 = llvm.mlir.constant(0 : index) : i64 - %25 = llvm.mlir.constant(25088 : index) : i64 - %26 = llvm.mlir.constant(7 : index) : i64 - %27 = llvm.mlir.constant(-1 : index) : i64 - %28 = nvvm.read.ptx.sreg.ctaid.x : i32 - %29 = llvm.sext %28 : i32 to i64 - %30 = nvvm.read.ptx.sreg.ntid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.tid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = llvm.mul %31, %29 : i64 - %35 = llvm.add %33, %34 : i64 - %36 = llvm.icmp "slt" %35, %25 : i64 - llvm.cond_br %36, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %37 = llvm.srem %35, %26 : i64 - %38 = llvm.icmp "slt" %37, %24 : i64 - %39 = llvm.add %37, %26 : i64 - %40 = llvm.select %38, %39, %37 : i1, i64 - %41 = llvm.icmp "slt" %35, %24 : i64 - %42 = llvm.sub %27, %35 : i64 - %43 = llvm.select %41, %42, %35 : i1, i64 - %44 = llvm.sdiv %43, %26 : i64 - %45 = llvm.sub %27, %44 : i64 - %46 = llvm.select %41, %45, %44 : i1, i64 - %47 = llvm.srem %46, %26 : i64 - %48 = llvm.icmp "slt" %47, %24 : i64 - %49 = llvm.add %47, %26 : i64 - %50 = llvm.select %48, %49, %47 : i1, i64 - %51 = llvm.icmp "slt" %46, %24 : i64 - %52 = llvm.sub %27, %46 : i64 - %53 = llvm.select %51, %52, %46 : 
i1, i64 - %54 = llvm.sdiv %53, %26 : i64 - %55 = llvm.sub %27, %54 : i64 - %56 = llvm.select %51, %55, %54 : i1, i64 - %57 = llvm.mul %24, %25 : i64 - %58 = llvm.mlir.constant(49 : index) : i64 - %59 = llvm.mul %56, %58 : i64 - %60 = llvm.add %57, %59 : i64 - %61 = llvm.mul %50, %26 : i64 - %62 = llvm.add %60, %61 : i64 - %63 = llvm.add %62, %40 : i64 - %64 = llvm.getelementptr %arg8[%63] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %65 = llvm.load %64 : !llvm.ptr -> f16 - %66 = llvm.mlir.constant(512 : index) : i64 - %67 = llvm.mul %24, %66 : i64 - %68 = llvm.add %67, %56 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fdiv %70, %23 : f16 - %72 = llvm.fcmp "ogt" %65, %22 : f16 - %73 = llvm.select %72, %71, %22 : i1, f16 - %74 = llvm.getelementptr %arg19[%63] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %73, %74 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %22 = llvm.mlir.constant(49 : index) : i64 + %23 = llvm.mlir.constant(25088 : index) : i64 + %24 = llvm.mlir.constant(4.900000e+01 : f16) : f16 + %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %26 = llvm.mlir.constant(0 : index) : i64 + %27 = nvvm.read.ptx.sreg.ctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = nvvm.read.ptx.sreg.ntid.x : i32 + %30 = llvm.sext %29 : i32 to i64 + %31 = nvvm.read.ptx.sreg.tid.x : i32 + %32 = llvm.sext %31 : i32 to i64 + %33 = llvm.mul %30, %28 : i64 + %34 = llvm.add %32, %33 : i64 + %35 = nvvm.read.ptx.sreg.nctaid.x : i32 + %36 = llvm.sext %35 : i32 to i64 + %37 = llvm.mul %30, %36 : i64 + llvm.br ^bb1(%34 : i64) + ^bb1(%38: i64): // 2 preds: ^bb0, ^bb2 + %39 = llvm.icmp "slt" %38, %23 : i64 + llvm.cond_br %39, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %40 = llvm.sdiv %38, %22 : i64 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %40, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %45 = llvm.mlir.constant(512 : index) : i64 + %46 = llvm.getelementptr %arg1[%40] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %47 = llvm.mul %26, %45 : i64 + %48 = llvm.add %47, %26 : i64 + %49 = llvm.getelementptr %46[%48] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %50 = llvm.load %49 : !llvm.ptr -> f16 + %51 = llvm.insertvalue %38, %7[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %52 = llvm.insertvalue %43, %51[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %23, %52[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %43, %53[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %22, %54[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %43, %55[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.mlir.constant(7 : index) : i64 + %58 = llvm.getelementptr %arg8[%38] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.mul %26, %23 : i64 + %60 = llvm.mul %26, %22 : i64 + %61 = llvm.add %59, %60 : i64 + %62 = llvm.mul %26, %57 : i64 + %63 = llvm.add %61, %62 : i64 + %64 = llvm.add %63, %26 : i64 + %65 = llvm.getelementptr %58[%64] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %66 = llvm.load %65 : !llvm.ptr -> f16 + %67 = llvm.fdiv %50, %24 : f16 + %68 = llvm.fcmp "ogt" %66, %25 : f16 + %69 = llvm.select %68, 
%67, %25 : i1, f16 + %70 = llvm.insertvalue %38, %15[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %71 = llvm.insertvalue %43, %70[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %23, %71[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %43, %72[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %22, %73[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %43, %74[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.getelementptr %arg19[%38] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %77 = llvm.getelementptr %76[%64] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %69, %77 : f16, !llvm.ptr + %78 = llvm.add %38, %37 : i64 + llvm.br ^bb1(%78 : i64) + ^bb3: // pred: ^bb1 llvm.return } } diff --git a/compiler/test/E2E/ResNet18/BW/2_linalg_tensor_opt.mlir b/compiler/test/E2E/ResNet18/BW/2_linalg_tensor_opt.mlir index 902811a2f..10309ea20 100644 --- a/compiler/test/E2E/ResNet18/BW/2_linalg_tensor_opt.mlir +++ b/compiler/test/E2E/ResNet18/BW/2_linalg_tensor_opt.mlir @@ -13,10 +13,10 @@ module { return %5 : tensor<1x512x7x7xf16> } func.func private @BatchNormGradOp1(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) + %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> + %1 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> + %2 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> } @@ -37,25 +37,6 @@ module { %2 = mhlo.select %1, %arg1, %0 : tensor<1x512x7x7xi1>, tensor<1x512x7x7xf16> return %2 : tensor<1x512x7x7xf16> } - func.func private @BatchNormGradOp5(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) 
{epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp6(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<1x512x7x7xf16> - return %2 : tensor<1x512x7x7xf16> - } - func.func private @ConvBackwardFilterOp7(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x512x512xf16>) -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } func.func private @Unknown8(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>, %arg2: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<1x512x7x7xf16> %1 = mhlo.add %arg0, %arg1 : tensor<1x512x7x7xf16> @@ -63,39 +44,6 @@ module { %3 = mhlo.select %2, %1, %0 : tensor<1x512x7x7xi1>, tensor<1x512x7x7xf16> return %3 : tensor<1x512x7x7xf16> } - func.func private @BatchNormGradOp9(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> 
tensor<1x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp10(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<1x512x7x7xf16> - return %2 : tensor<1x512x7x7xf16> - } - func.func private @ConvBackwardFilterOp11(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x512x512xf16>) -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown12(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x512x7x7xf16> - %1 = mhlo.compare GT, %arg0, %0 : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xi1> - %2 = mhlo.select %1, %arg1, %0 : tensor<1x512x7x7xi1>, tensor<1x512x7x7xf16> - return %2 : tensor<1x512x7x7xf16> - } - func.func private @BatchNormGradOp13(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 
1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } func.func private @ConvBackwardDataOp14(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512x256x3x3xf16>) -> tensor<1x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<512x256x3x3xf16>) -> tensor<3x3x256x512xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x256x512xf16>) -> tensor<3x3x256x512xf16> @@ -107,14 +55,6 @@ module { %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x256x512xf16>) -> tensor<512x256x3x3xf16> return %1 : tensor<512x256x3x3xf16> } - func.func private @BatchNormGradOp16(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } func.func private @ConvBackwardDataOp17(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512x256x1x1xf16>) -> tensor<1x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<512x256x1x1xf16>) -> tensor<1x1x256x512xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 
1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<1x1x256x512xf16>) -> tensor<1x256x14x14xf16> @@ -133,10 +73,10 @@ module { return %3 : tensor<1x256x14x14xf16> } func.func private @BatchNormGradOp20(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) + %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> + %1 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> + %2 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) %3 = mhlo.convert %grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> } @@ -157,65 +97,6 @@ module { %2 = mhlo.select %1, %arg1, %0 : tensor<1x256x14x14xi1>, tensor<1x256x14x14xf16> return %2 : tensor<1x256x14x14xf16> } - func.func private @BatchNormGradOp24(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp25(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, 
byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<1x256x14x14xf16> - return %2 : tensor<1x256x14x14xf16> - } - func.func private @ConvBackwardFilterOp26(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown27(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>, %arg2: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x256x14x14xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<1x256x14x14xf16> - %2 = mhlo.compare GT, %arg2, %0 : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xi1> - %3 = mhlo.select %2, %1, %0 : tensor<1x256x14x14xi1>, tensor<1x256x14x14xf16> - return %3 : tensor<1x256x14x14xf16> - } - func.func private @BatchNormGradOp28(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp29(%arg0: tensor<1x256x14x14xf16>, %arg1: 
tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<1x256x14x14xf16> - return %2 : tensor<1x256x14x14xf16> - } - func.func private @ConvBackwardFilterOp30(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown31(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x256x14x14xf16> - %1 = mhlo.compare GT, %arg0, %0 : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xi1> - %2 = mhlo.select %1, %arg1, %0 : tensor<1x256x14x14xi1>, tensor<1x256x14x14xf16> - return %2 : tensor<1x256x14x14xf16> - } - func.func private @BatchNormGradOp32(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert 
%grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } func.func private @ConvBackwardDataOp33(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256x128x3x3xf16>) -> tensor<1x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<256x128x3x3xf16>) -> tensor<3x3x128x256xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x128x256xf16>) -> tensor<3x3x128x256xf16> @@ -227,14 +108,6 @@ module { %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x128x256xf16>) -> tensor<256x128x3x3xf16> return %1 : tensor<256x128x3x3xf16> } - func.func private @BatchNormGradOp35(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } func.func private @ConvBackwardDataOp36(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256x128x1x1xf16>) -> tensor<1x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<256x128x1x1xf16>) -> tensor<1x1x128x256xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<1x1x128x256xf16>) -> tensor<1x128x28x28xf16> @@ -253,10 +126,10 @@ module { return %3 : tensor<1x128x28x28xf16> } func.func private @BatchNormGradOp39(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, 
tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) + %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> + %1 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> + %2 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> } @@ -277,65 +150,6 @@ module { %2 = mhlo.select %1, %arg1, %0 : tensor<1x128x28x28xi1>, tensor<1x128x28x28xf16> return %2 : tensor<1x128x28x28xf16> } - func.func private @BatchNormGradOp43(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp44(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], 
[1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<1x128x28x28xf16> - return %2 : tensor<1x128x28x28xf16> - } - func.func private @ConvBackwardFilterOp45(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown46(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>, %arg2: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x128x28x28xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<1x128x28x28xf16> - %2 = mhlo.compare GT, %arg2, %0 : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xi1> - %3 = mhlo.select %2, %1, %0 : tensor<1x128x28x28xi1>, tensor<1x128x28x28xf16> - return %3 : tensor<1x128x28x28xf16> - } - func.func private @BatchNormGradOp47(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp48(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<128x128x3x3xf16>) -> 
tensor<3x3x128x128xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<1x128x28x28xf16> - return %2 : tensor<1x128x28x28xf16> - } - func.func private @ConvBackwardFilterOp49(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown50(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x128x28x28xf16> - %1 = mhlo.compare GT, %arg0, %0 : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xi1> - %2 = mhlo.select %1, %arg1, %0 : tensor<1x128x28x28xi1>, tensor<1x128x28x28xf16> - return %2 : tensor<1x128x28x28xf16> - } - func.func private @BatchNormGradOp51(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } func.func private @ConvBackwardDataOp52(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128x64x3x3xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides 
= dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<128x64x3x3xf16>) -> tensor<3x3x64x128xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x64x128xf16>) -> tensor<3x3x64x128xf16> @@ -347,14 +161,6 @@ module { %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x64x128xf16>) -> tensor<128x64x3x3xf16> return %1 : tensor<128x64x3x3xf16> } - func.func private @BatchNormGradOp54(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } func.func private @ConvBackwardDataOp55(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128x64x1x1xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<128x64x1x1xf16>) -> tensor<1x1x64x128xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<1x1x64x128xf16>) -> tensor<1x64x56x56xf16> @@ -373,10 +179,10 @@ module { return %3 : tensor<1x64x56x56xf16> } func.func private @BatchNormGradOp58(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %1 = mhlo.convert %arg2 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, 
tensor<1x64x56x56xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) + %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> + %1 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> + %2 = mhlo.convert %arg2 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x56x56xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) %3 = mhlo.convert %grad_operand : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> return %3, %grad_scale, %grad_offset : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> } @@ -397,76 +203,6 @@ module { %2 = mhlo.select %1, %arg1, %0 : tensor<1x64x56x56xi1>, tensor<1x64x56x56xf16> return %2 : tensor<1x64x56x56xf16> } - func.func private @BatchNormGradOp62(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %1 = mhlo.convert %arg2 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x56x56xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp63(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<1x64x56x56xf16> - return %2 : tensor<1x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp64(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = 
dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown65(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>, %arg2: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x64x56x56xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<1x64x56x56xf16> - %2 = mhlo.compare GT, %arg2, %0 : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xi1> - %3 = mhlo.select %2, %1, %0 : tensor<1x64x56x56xi1>, tensor<1x64x56x56xf16> - return %3 : tensor<1x64x56x56xf16> - } - func.func private @BatchNormGradOp66(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %1 = mhlo.convert %arg2 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x56x56xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp67(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<1x64x56x56xf16> - return %2 : tensor<1x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp68(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> 
tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown69(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x64x56x56xf16> - %1 = mhlo.compare GT, %arg0, %0 : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xi1> - %2 = mhlo.select %1, %arg1, %0 : tensor<1x64x56x56xi1>, tensor<1x64x56x56xf16> - return %2 : tensor<1x64x56x56xf16> - } - func.func private @BatchNormGradOp70(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %1 = mhlo.convert %arg2 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x56x56xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp71(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, 
tensor<3x3x64x64xf16>) -> tensor<1x64x56x56xf16> - return %2 : tensor<1x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp72(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } func.func private @Unknown73(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.add %arg0, %arg1 : tensor<1x64x56x56xf16> return %0 : tensor<1x64x56x56xf16> @@ -478,10 +214,10 @@ module { return %2 : tensor<1x64x112x112xf16> } func.func private @BatchNormGradOp75(%arg0: tensor<1x64x112x112xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x112x112xf16>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf32> - %1 = mhlo.convert %arg2 : (tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x112x112xf32>) -> (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>) + %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> + %1 = mhlo.convert %arg0 : (tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf32> + %2 = mhlo.convert %arg2 : (tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x112x112xf32>) -> (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>) %3 = mhlo.convert %grad_operand : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf16> return %3, %grad_scale, %grad_offset : tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32> } @@ -494,215 +230,170 @@ module { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> return %0 : tensor<64x3x7x7xf32> } - func.func private @Unknown78(%arg0: tensor<1x1000xf16>) -> tensor<1x1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: tensor<1x1000xf16>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<1x1000xf16>) -> tensor<1x1000xf32> - return %0 : 
tensor<1x1000xf32> - } - func.func private @Unknown79(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<1000xf32>) -> tensor<1000xf16> - %1 = mhlo.convert %0 : (tensor<1000xf16>) -> tensor<1000xf32> - return %1 : tensor<1000xf32> + %1 = mhlo.reshape %0 : (tensor<1x1000xf32>) -> tensor<1000xf32> + %2 = mhlo.convert %1 : (tensor<1000xf32>) -> tensor<1000xf16> + %3 = mhlo.convert %2 : (tensor<1000xf16>) -> tensor<1000xf32> + return %3 : tensor<1000xf32> } - func.func private @Unknown80(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown79(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<1000x512xf16>) -> tensor<1000x512xf32> return %0 : tensor<1000x512xf32> } - func.func private @Unknown81(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - return %0 : tensor<64x64x3x3xf32> - } - func.func private @Unknown82(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown80(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> return %0 : tensor<64x64x3x3xf32> } - func.func private @Unknown83(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - return %0 : tensor<64x64x3x3xf32> - } - func.func private @Unknown84(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - return %0 : tensor<64x64x3x3xf32> - } - func.func private @Unknown85(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown84(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> return %0 : tensor<128x64x3x3xf32> } - func.func private @Unknown86(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown85(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> return %0 : tensor<128x128x3x3xf32> } - func.func private @Unknown87(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown86(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> return %0 : 
tensor<128x64x1x1xf32> } - func.func private @Unknown88(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - return %0 : tensor<128x128x3x3xf32> - } - func.func private @Unknown89(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - return %0 : tensor<128x128x3x3xf32> - } - func.func private @Unknown90(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> return %0 : tensor<256x128x3x3xf32> } - func.func private @Unknown91(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown90(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> return %0 : tensor<256x256x3x3xf32> } - func.func private @Unknown92(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown91(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> return %0 : tensor<256x128x1x1xf32> } - func.func private @Unknown93(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - return %0 : tensor<256x256x3x3xf32> - } - func.func private @Unknown94(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - return %0 : tensor<256x256x3x3xf32> - } - func.func private @Unknown95(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown94(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> return %0 : tensor<512x256x3x3xf32> } - func.func private @Unknown96(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown95(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> return %0 : tensor<512x512x3x3xf32> } - func.func private @Unknown97(%arg0: 
tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown96(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> return %0 : tensor<512x256x1x1xf32> } - func.func private @Unknown98(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - return %0 : tensor<512x512x3x3xf32> - } - func.func private @Unknown99(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - return %0 : tensor<512x512x3x3xf32> - } func.func @main(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>, %arg3: tensor<64xf32>, %arg4: tensor<64xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64xf32>, %arg10: tensor<128xf32>, %arg11: tensor<128xf32>, %arg12: tensor<128xf32>, %arg13: tensor<128xf32>, %arg14: tensor<128xf32>, %arg15: tensor<128xf32>, %arg16: tensor<128xf32>, %arg17: tensor<128xf32>, %arg18: tensor<128xf32>, %arg19: tensor<128xf32>, %arg20: tensor<256xf32>, %arg21: tensor<256xf32>, %arg22: tensor<256xf32>, %arg23: tensor<256xf32>, %arg24: tensor<256xf32>, %arg25: tensor<256xf32>, %arg26: tensor<256xf32>, %arg27: tensor<256xf32>, %arg28: tensor<256xf32>, %arg29: tensor<256xf32>, %arg30: tensor<512xf32>, %arg31: tensor<512xf32>, %arg32: tensor<512xf32>, %arg33: tensor<512xf32>, %arg34: tensor<512xf32>, %arg35: tensor<512xf32>, %arg36: tensor<512xf32>, %arg37: tensor<512xf32>, %arg38: tensor<512xf32>, %arg39: tensor<512xf32>, %arg40: tensor<64xf32>, %arg41: tensor<64xf32>, %arg42: tensor<64xf32>, %arg43: tensor<64xf32>, %arg44: tensor<64xf32>, %arg45: tensor<64xf32>, %arg46: tensor<64xf32>, %arg47: tensor<64xf32>, %arg48: tensor<64xf32>, %arg49: tensor<64xf32>, %arg50: tensor<128xf32>, %arg51: tensor<128xf32>, %arg52: tensor<128xf32>, %arg53: tensor<128xf32>, %arg54: tensor<128xf32>, %arg55: tensor<128xf32>, %arg56: tensor<128xf32>, %arg57: tensor<128xf32>, %arg58: tensor<128xf32>, %arg59: tensor<128xf32>, %arg60: tensor<256xf32>, %arg61: tensor<256xf32>, %arg62: tensor<256xf32>, %arg63: tensor<256xf32>, %arg64: tensor<256xf32>, %arg65: tensor<256xf32>, %arg66: tensor<256xf32>, %arg67: tensor<256xf32>, %arg68: tensor<256xf32>, %arg69: tensor<256xf32>, %arg70: tensor<512xf32>, %arg71: tensor<512xf32>, %arg72: tensor<512xf32>, %arg73: tensor<512xf32>, %arg74: tensor<512xf32>, %arg75: tensor<512xf32>, %arg76: tensor<512xf32>, %arg77: tensor<512xf32>, %arg78: tensor<512xf32>, %arg79: tensor<512xf32>, %arg80: tensor<64x3x7x7xf16>, %arg81: tensor<1x3x224x224xf16>, %arg82: tensor<1x64x112x112xf16>, %arg83: tensor<1x64x112x112xf16>, %arg84: tensor<1x64x56x56xf16>, %arg85: tensor<64x64x3x3xf16>, %arg86: tensor<1x64x56x56xf16>, %arg87: tensor<1x64x56x56xf16>, %arg88: tensor<64x64x3x3xf16>, %arg89: tensor<1x64x56x56xf16>, %arg90: tensor<1x64x56x56xf16>, %arg91: tensor<64x64x3x3xf16>, %arg92: tensor<1x64x56x56xf16>, %arg93: tensor<1x64x56x56xf16>, %arg94: tensor<64x64x3x3xf16>, %arg95: tensor<1x64x56x56xf16>, %arg96: 
tensor<1x64x56x56xf16>, %arg97: tensor<128x64x3x3xf16>, %arg98: tensor<1x128x28x28xf16>, %arg99: tensor<1x128x28x28xf16>, %arg100: tensor<128x128x3x3xf16>, %arg101: tensor<1x128x28x28xf16>, %arg102: tensor<128x64x1x1xf16>, %arg103: tensor<1x128x28x28xf16>, %arg104: tensor<1x128x28x28xf16>, %arg105: tensor<128x128x3x3xf16>, %arg106: tensor<1x128x28x28xf16>, %arg107: tensor<1x128x28x28xf16>, %arg108: tensor<128x128x3x3xf16>, %arg109: tensor<1x128x28x28xf16>, %arg110: tensor<1x128x28x28xf16>, %arg111: tensor<256x128x3x3xf16>, %arg112: tensor<1x256x14x14xf16>, %arg113: tensor<1x256x14x14xf16>, %arg114: tensor<256x256x3x3xf16>, %arg115: tensor<1x256x14x14xf16>, %arg116: tensor<256x128x1x1xf16>, %arg117: tensor<1x256x14x14xf16>, %arg118: tensor<1x256x14x14xf16>, %arg119: tensor<256x256x3x3xf16>, %arg120: tensor<1x256x14x14xf16>, %arg121: tensor<1x256x14x14xf16>, %arg122: tensor<256x256x3x3xf16>, %arg123: tensor<1x256x14x14xf16>, %arg124: tensor<1x256x14x14xf16>, %arg125: tensor<512x256x3x3xf16>, %arg126: tensor<1x512x7x7xf16>, %arg127: tensor<1x512x7x7xf16>, %arg128: tensor<512x512x3x3xf16>, %arg129: tensor<1x512x7x7xf16>, %arg130: tensor<512x256x1x1xf16>, %arg131: tensor<1x512x7x7xf16>, %arg132: tensor<1x512x7x7xf16>, %arg133: tensor<512x512x3x3xf16>, %arg134: tensor<1x512x7x7xf16>, %arg135: tensor<1x512x7x7xf16>, %arg136: tensor<512x512x3x3xf16>, %arg137: tensor<1x512x7x7xf16>, %arg138: tensor<1x512x7x7xf16>, %arg139: tensor<1x512xf16>, %arg140: tensor<512x1000xf16>, %arg141: tensor<1x1000xf16>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32>) { - %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = mhlo.constant dense<0.000000e+00> : tensor - %2 = "mhlo.dot_general"(%arg141, %arg140) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<1x1000xf16>, tensor<512x1000xf16>) -> tensor<1x512xf16> - %3 = call @Unknown0(%2, %arg138) : (tensor<1x512xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %4:3 = call @BatchNormGradOp1(%arg137, %arg39, %3) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %5 = call @ConvBackwardDataOp2(%4#0, %arg136) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %6 = call @ConvBackwardFilterOp3(%arg135, %4#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %7 = call 
@Unknown4(%arg135, %5) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %8:3 = call @BatchNormGradOp5(%arg134, %arg37, %7) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %9 = call @ConvBackwardDataOp6(%8#0, %arg133) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %10 = call @ConvBackwardFilterOp7(%arg132, %8#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %11 = call @Unknown8(%3, %9, %arg132) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %12:3 = call @BatchNormGradOp9(%arg129, %arg33, %11) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %13 = call @ConvBackwardDataOp10(%12#0, %arg128) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %14 = call @ConvBackwardFilterOp11(%arg127, %12#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %15 = call @Unknown12(%arg127, %13) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %16:3 = call @BatchNormGradOp13(%arg126, %arg31, %15) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %17 = call @ConvBackwardDataOp14(%16#0, %arg125) : (tensor<1x512x7x7xf16>, tensor<512x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %18 = call @ConvBackwardFilterOp15(%arg124, %16#0) : (tensor<1x256x14x14xf16>, tensor<1x512x7x7xf16>) -> tensor<512x256x3x3xf16> - %19:3 = call @BatchNormGradOp16(%arg131, %arg35, %11) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %20 = call @ConvBackwardDataOp17(%19#0, %arg130) : (tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>) -> tensor<1x256x14x14xf16> - %21 = call @ConvBackwardFilterOp18(%arg124, %19#0) : (tensor<1x256x14x14xf16>, tensor<1x512x7x7xf16>) -> tensor<512x256x1x1xf16> - %22 = call @Unknown19(%20, %17, %arg124) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %23:3 = call @BatchNormGradOp20(%arg123, %arg29, %22) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %24 = call @ConvBackwardDataOp21(%23#0, %arg122) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %25 = call @ConvBackwardFilterOp22(%arg121, %23#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %26 = call @Unknown23(%arg121, %24) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %27:3 = call @BatchNormGradOp24(%arg120, %arg27, %26) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %28 = call @ConvBackwardDataOp25(%27#0, %arg119) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %29 = call @ConvBackwardFilterOp26(%arg118, %27#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %30 = call @Unknown27(%22, %28, %arg118) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %31:3 = call @BatchNormGradOp28(%arg115, %arg23, %30) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> 
(tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %32 = call @ConvBackwardDataOp29(%31#0, %arg114) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %33 = call @ConvBackwardFilterOp30(%arg113, %31#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %34 = call @Unknown31(%arg113, %32) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %35:3 = call @BatchNormGradOp32(%arg112, %arg21, %34) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %36 = call @ConvBackwardDataOp33(%35#0, %arg111) : (tensor<1x256x14x14xf16>, tensor<256x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %37 = call @ConvBackwardFilterOp34(%arg110, %35#0) : (tensor<1x128x28x28xf16>, tensor<1x256x14x14xf16>) -> tensor<256x128x3x3xf16> - %38:3 = call @BatchNormGradOp35(%arg117, %arg25, %30) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %39 = call @ConvBackwardDataOp36(%38#0, %arg116) : (tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>) -> tensor<1x128x28x28xf16> - %40 = call @ConvBackwardFilterOp37(%arg110, %38#0) : (tensor<1x128x28x28xf16>, tensor<1x256x14x14xf16>) -> tensor<256x128x1x1xf16> - %41 = call @Unknown38(%39, %36, %arg110) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %42:3 = call @BatchNormGradOp39(%arg109, %arg19, %41) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %43 = call @ConvBackwardDataOp40(%42#0, %arg108) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %44 = call @ConvBackwardFilterOp41(%arg107, %42#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %45 = call @Unknown42(%arg107, %43) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %46:3 = call @BatchNormGradOp43(%arg106, %arg17, %45) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %47 = call @ConvBackwardDataOp44(%46#0, %arg105) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %48 = call @ConvBackwardFilterOp45(%arg104, %46#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %49 = call @Unknown46(%41, %47, %arg104) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %50:3 = call @BatchNormGradOp47(%arg101, %arg13, %49) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %51 = call @ConvBackwardDataOp48(%50#0, %arg100) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %52 = call @ConvBackwardFilterOp49(%arg99, %50#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %53 = call @Unknown50(%arg99, %51) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %54:3 = call @BatchNormGradOp51(%arg98, %arg11, %53) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %55 = call @ConvBackwardDataOp52(%54#0, %arg97) : (tensor<1x128x28x28xf16>, tensor<128x64x3x3xf16>) -> tensor<1x64x56x56xf16> - 
%56 = call @ConvBackwardFilterOp53(%arg96, %54#0) : (tensor<1x64x56x56xf16>, tensor<1x128x28x28xf16>) -> tensor<128x64x3x3xf16> - %57:3 = call @BatchNormGradOp54(%arg103, %arg15, %49) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %58 = call @ConvBackwardDataOp55(%57#0, %arg102) : (tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>) -> tensor<1x64x56x56xf16> - %59 = call @ConvBackwardFilterOp56(%arg96, %57#0) : (tensor<1x64x56x56xf16>, tensor<1x128x28x28xf16>) -> tensor<128x64x1x1xf16> - %60 = call @Unknown57(%58, %55, %arg96) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %61:3 = call @BatchNormGradOp58(%arg95, %arg9, %60) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %62 = call @ConvBackwardDataOp59(%61#0, %arg94) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %63 = call @ConvBackwardFilterOp60(%arg93, %61#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %64 = call @Unknown61(%arg93, %62) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %65:3 = call @BatchNormGradOp62(%arg92, %arg7, %64) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %66 = call @ConvBackwardDataOp63(%65#0, %arg91) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %67 = call @ConvBackwardFilterOp64(%arg90, %65#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %68 = call @Unknown65(%60, %66, %arg90) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %69:3 = call @BatchNormGradOp66(%arg89, %arg5, %68) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %70 = call @ConvBackwardDataOp67(%69#0, %arg88) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %71 = call @ConvBackwardFilterOp68(%arg87, %69#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %72 = call @Unknown69(%arg87, %70) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %73:3 = call @BatchNormGradOp70(%arg86, %arg3, %72) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %74 = call @ConvBackwardDataOp71(%73#0, %arg85) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %75 = call @ConvBackwardFilterOp72(%arg84, %73#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %76 = call @Unknown73(%68, %74) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %77 = "mhlo.select_and_scatter"(%arg83, %76, %1) ({ + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = "mhlo.dot_general"(%arg141, %arg140) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<1x1000xf16>, tensor<512x1000xf16>) -> tensor<1x512xf16> + %2 = call @Unknown0(%1, %arg138) : (tensor<1x512xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %3:3 = call @BatchNormGradOp1(%arg137, %arg39, %2) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %4 = call @ConvBackwardDataOp2(%3#0, 
%arg136) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %5 = call @ConvBackwardFilterOp3(%arg135, %3#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %6 = call @Unknown4(%arg135, %4) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %7:3 = call @BatchNormGradOp1(%arg134, %arg37, %6) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %8 = call @ConvBackwardDataOp2(%7#0, %arg133) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %9 = call @ConvBackwardFilterOp3(%arg132, %7#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %10 = call @Unknown8(%2, %8, %arg132) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %11:3 = call @BatchNormGradOp1(%arg129, %arg33, %10) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %12 = call @ConvBackwardDataOp2(%11#0, %arg128) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %13 = call @ConvBackwardFilterOp3(%arg127, %11#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %14 = call @Unknown4(%arg127, %12) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %15:3 = call @BatchNormGradOp1(%arg126, %arg31, %14) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %16 = call @ConvBackwardDataOp14(%15#0, %arg125) : (tensor<1x512x7x7xf16>, tensor<512x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %17 = call @ConvBackwardFilterOp15(%arg124, %15#0) : (tensor<1x256x14x14xf16>, tensor<1x512x7x7xf16>) -> tensor<512x256x3x3xf16> + %18:3 = call @BatchNormGradOp1(%arg131, %arg35, %10) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %19 = call @ConvBackwardDataOp17(%18#0, %arg130) : (tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>) -> tensor<1x256x14x14xf16> + %20 = call @ConvBackwardFilterOp18(%arg124, %18#0) : (tensor<1x256x14x14xf16>, tensor<1x512x7x7xf16>) -> tensor<512x256x1x1xf16> + %21 = call @Unknown19(%19, %16, %arg124) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %22:3 = call @BatchNormGradOp20(%arg123, %arg29, %21) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %23 = call @ConvBackwardDataOp21(%22#0, %arg122) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %24 = call @ConvBackwardFilterOp22(%arg121, %22#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %25 = call @Unknown23(%arg121, %23) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %26:3 = call @BatchNormGradOp20(%arg120, %arg27, %25) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %27 = call @ConvBackwardDataOp21(%26#0, %arg119) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %28 = call @ConvBackwardFilterOp22(%arg118, %26#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %29 = call @Unknown19(%21, %27, %arg118) : 
(tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %30:3 = call @BatchNormGradOp20(%arg115, %arg23, %29) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %31 = call @ConvBackwardDataOp21(%30#0, %arg114) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %32 = call @ConvBackwardFilterOp22(%arg113, %30#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %33 = call @Unknown23(%arg113, %31) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %34:3 = call @BatchNormGradOp20(%arg112, %arg21, %33) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %35 = call @ConvBackwardDataOp33(%34#0, %arg111) : (tensor<1x256x14x14xf16>, tensor<256x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %36 = call @ConvBackwardFilterOp34(%arg110, %34#0) : (tensor<1x128x28x28xf16>, tensor<1x256x14x14xf16>) -> tensor<256x128x3x3xf16> + %37:3 = call @BatchNormGradOp20(%arg117, %arg25, %29) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %38 = call @ConvBackwardDataOp36(%37#0, %arg116) : (tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>) -> tensor<1x128x28x28xf16> + %39 = call @ConvBackwardFilterOp37(%arg110, %37#0) : (tensor<1x128x28x28xf16>, tensor<1x256x14x14xf16>) -> tensor<256x128x1x1xf16> + %40 = call @Unknown38(%38, %35, %arg110) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %41:3 = call @BatchNormGradOp39(%arg109, %arg19, %40) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %42 = call @ConvBackwardDataOp40(%41#0, %arg108) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %43 = call @ConvBackwardFilterOp41(%arg107, %41#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %44 = call @Unknown42(%arg107, %42) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %45:3 = call @BatchNormGradOp39(%arg106, %arg17, %44) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %46 = call @ConvBackwardDataOp40(%45#0, %arg105) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %47 = call @ConvBackwardFilterOp41(%arg104, %45#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %48 = call @Unknown38(%40, %46, %arg104) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %49:3 = call @BatchNormGradOp39(%arg101, %arg13, %48) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %50 = call @ConvBackwardDataOp40(%49#0, %arg100) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %51 = call @ConvBackwardFilterOp41(%arg99, %49#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %52 = call @Unknown42(%arg99, %50) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %53:3 = call @BatchNormGradOp39(%arg98, %arg11, %52) : (tensor<1x128x28x28xf16>, 
tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %54 = call @ConvBackwardDataOp52(%53#0, %arg97) : (tensor<1x128x28x28xf16>, tensor<128x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %55 = call @ConvBackwardFilterOp53(%arg96, %53#0) : (tensor<1x64x56x56xf16>, tensor<1x128x28x28xf16>) -> tensor<128x64x3x3xf16> + %56:3 = call @BatchNormGradOp39(%arg103, %arg15, %48) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %57 = call @ConvBackwardDataOp55(%56#0, %arg102) : (tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>) -> tensor<1x64x56x56xf16> + %58 = call @ConvBackwardFilterOp56(%arg96, %56#0) : (tensor<1x64x56x56xf16>, tensor<1x128x28x28xf16>) -> tensor<128x64x1x1xf16> + %59 = call @Unknown57(%57, %54, %arg96) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %60:3 = call @BatchNormGradOp58(%arg95, %arg9, %59) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %61 = call @ConvBackwardDataOp59(%60#0, %arg94) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %62 = call @ConvBackwardFilterOp60(%arg93, %60#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %63 = call @Unknown61(%arg93, %61) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %64:3 = call @BatchNormGradOp58(%arg92, %arg7, %63) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %65 = call @ConvBackwardDataOp59(%64#0, %arg91) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %66 = call @ConvBackwardFilterOp60(%arg90, %64#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %67 = call @Unknown57(%59, %65, %arg90) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %68:3 = call @BatchNormGradOp58(%arg89, %arg5, %67) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %69 = call @ConvBackwardDataOp59(%68#0, %arg88) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %70 = call @ConvBackwardFilterOp60(%arg87, %68#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %71 = call @Unknown61(%arg87, %69) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %72:3 = call @BatchNormGradOp58(%arg86, %arg3, %71) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %73 = call @ConvBackwardDataOp59(%72#0, %arg85) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %74 = call @ConvBackwardFilterOp60(%arg84, %72#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %75 = call @Unknown73(%67, %73) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %76 = "mhlo.select_and_scatter"(%arg83, %75, %0) ({ ^bb0(%arg142: tensor, %arg143: tensor): - %107 = mhlo.compare GE, %arg142, %arg143 : (tensor, tensor) -> tensor - mhlo.return %107 : tensor + %104 = mhlo.compare GE, %arg142, %arg143 : (tensor, tensor) -> tensor + mhlo.return %104 : tensor }, { ^bb0(%arg142: tensor, %arg143: tensor): - %107 = mhlo.add %arg142, %arg143 : 
tensor - mhlo.return %107 : tensor + %104 = mhlo.add %arg142, %arg143 : tensor + mhlo.return %104 : tensor }) {padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : (tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor) -> tensor<1x64x112x112xf16> - %78 = call @Unknown74(%arg83, %77) : (tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> - %79:3 = call @BatchNormGradOp75(%arg82, %arg1, %78) : (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<1x64x112x112xf16>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) - %80 = call @ConvBackwardFilterOp76(%arg81, %79#0) : (tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>) -> tensor<64x3x7x7xf16> - %81 = call @Unknown77(%80) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> - %82 = call @Unknown78(%arg141) : (tensor<1x1000xf16>) -> tensor<1x1000xf32> - %83 = mhlo.reduce(%82 init: %0) across dimensions = [0] : (tensor<1x1000xf32>, tensor) -> tensor<1000xf32> - reducer(%arg142: tensor, %arg143: tensor) { - %107 = mhlo.add %arg142, %arg143 : tensor - mhlo.return %107 : tensor - } - %84 = call @Unknown79(%83) : (tensor<1000xf32>) -> tensor<1000xf32> - %85 = mhlo.reshape %arg141 : (tensor<1x1000xf16>) -> tensor<1000x1xf16> - %86 = "mhlo.dot"(%85, %arg139) {precision_config = [#mhlo, #mhlo]} : (tensor<1000x1xf16>, tensor<1x512xf16>) -> tensor<1000x512xf16> - %87 = call @Unknown80(%86) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> - %88 = call @Unknown81(%75) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %89 = call @Unknown82(%71) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %90 = call @Unknown83(%67) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %91 = call @Unknown84(%63) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %92 = call @Unknown85(%56) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> - %93 = call @Unknown86(%52) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %94 = call @Unknown87(%59) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> - %95 = call @Unknown88(%48) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %96 = call @Unknown89(%44) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %97 = call @Unknown90(%37) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> - %98 = call @Unknown91(%33) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %99 = call @Unknown92(%40) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> - %100 = call @Unknown93(%29) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %101 = call @Unknown94(%25) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %102 = call @Unknown95(%18) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> - %103 = call @Unknown96(%14) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %104 = call @Unknown97(%21) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> - %105 = call @Unknown98(%10) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %106 = call @Unknown99(%6) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - return %79#2, %79#1, %81, %84, %87, %73#2, %73#1, %69#2, %69#1, %88, %89, %65#2, %65#1, %61#2, %61#1, %90, %91, %54#2, %54#1, %50#2, %50#1, %92, %93, %94, %57#2, %57#1, %46#2, %46#1, %42#2, %42#1, %95, %96, %35#2, %35#1, %31#2, %31#1, %97, %98, %99, %38#2, %38#1, %27#2, %27#1, %23#2, %23#1, %100, %101, %16#2, %16#1, %12#2, %12#1, %102, %103, %104, %19#2, %19#1, %8#2, %8#1, %4#2, %4#1, %105, %106 : 
tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32> + %77 = call @Unknown74(%arg83, %76) : (tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> + %78:3 = call @BatchNormGradOp75(%arg82, %arg1, %77) : (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<1x64x112x112xf16>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) + %79 = call @ConvBackwardFilterOp76(%arg81, %78#0) : (tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>) -> tensor<64x3x7x7xf16> + %80 = call @Unknown77(%79) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> + %81 = call @Unknown78(%arg141) : (tensor<1x1000xf16>) -> tensor<1000xf32> + %82 = mhlo.reshape %arg141 : (tensor<1x1000xf16>) -> tensor<1000x1xf16> + %83 = "mhlo.dot"(%82, %arg139) {precision_config = [#mhlo, #mhlo]} : (tensor<1000x1xf16>, tensor<1x512xf16>) -> tensor<1000x512xf16> + %84 = call @Unknown79(%83) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> + %85 = call @Unknown80(%74) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %86 = call @Unknown80(%70) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %87 = call @Unknown80(%66) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %88 = call @Unknown80(%62) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %89 = call @Unknown84(%55) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> + %90 = call @Unknown85(%51) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %91 = call @Unknown86(%58) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> + %92 = call @Unknown85(%47) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %93 = call @Unknown85(%43) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %94 = call @Unknown89(%36) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> + %95 = call @Unknown90(%32) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %96 = call @Unknown91(%39) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> + %97 = call @Unknown90(%28) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %98 = call @Unknown90(%24) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %99 = call @Unknown94(%17) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> + %100 = call @Unknown95(%13) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %101 = call @Unknown96(%20) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> + %102 = call @Unknown95(%9) : (tensor<512x512x3x3xf16>) -> 
tensor<512x512x3x3xf32> + %103 = call @Unknown95(%5) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + return %78#2, %78#1, %80, %81, %84, %72#2, %72#1, %68#2, %68#1, %85, %86, %64#2, %64#1, %60#2, %60#1, %87, %88, %53#2, %53#1, %49#2, %49#1, %89, %90, %91, %56#2, %56#1, %45#2, %45#1, %41#2, %41#1, %92, %93, %34#2, %34#1, %30#2, %30#1, %94, %95, %96, %37#2, %37#1, %26#2, %26#1, %22#2, %22#1, %97, %98, %15#2, %15#1, %11#2, %11#1, %99, %100, %101, %18#2, %18#1, %7#2, %7#1, %3#2, %3#1, %102, %103 : tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/3_byre_tensor_opt.mlir b/compiler/test/E2E/ResNet18/BW/3_byre_tensor_opt.mlir index 08a074f1b..17261feae 100644 --- a/compiler/test/E2E/ResNet18/BW/3_byre_tensor_opt.mlir +++ b/compiler/test/E2E/ResNet18/BW/3_byre_tensor_opt.mlir @@ -2,22 +2,36 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> -#map2 = affine_map<(d0, d1) -> (d0, d1)> -#map3 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> module { func.func private @Unknown0(%arg0: tensor<1x512xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %cst_0 = arith.constant 4.900000e+01 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1, %arg0 : tensor<1x512x7x7xf16>, tensor<1x512xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_1: f16, %out: f16): - %2 = arith.divf %in_1, %cst_0 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c512 step %c1 iter_args(%arg3 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c7 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2] [1, 1] [1, 1] : tensor<1x512xf16> to tensor + %extracted_slice_1 = 
tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %6 = arith.divf %in, %cst_0 : f16 + %7 = arith.cmpf ogt, %in_2, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } func.func private @BatchNormGradOp1(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { @@ -40,85 +54,63 @@ module { return %1 : tensor<512x512x3x3xf16> } func.func private @Unknown4(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c512 step %c1 iter_args(%arg3 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c7 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } - func.func private @BatchNormGradOp5(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : 
tensor<512xf32> - %1 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %2 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp6(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<1x512x7x7xf16> - return %2 : tensor<1x512x7x7xf16> - } - func.func private @ConvBackwardFilterOp7(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x512x512xf16>) -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } func.func private @Unknown8(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>, %arg2: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, 
tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x512x7x7xf16> - return %1 : tensor<1x512x7x7xf16> - } - func.func private @BatchNormGradOp9(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %1 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %2 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp10(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<1x512x7x7xf16> - return %2 : tensor<1x512x7x7xf16> - } - func.func private @ConvBackwardFilterOp11(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x512x512xf16>) -> 
tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown12(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: f16, %out: f16): + %6 = arith.addf %in, %in_2 : f16 + %7 = arith.cmpf ogt, %in_3, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } - func.func private @BatchNormGradOp13(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %1 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %2 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } func.func private @ConvBackwardDataOp14(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512x256x3x3xf16>) -> tensor<1x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = 
dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<512x256x3x3xf16>) -> tensor<3x3x256x512xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x256x512xf16>) -> tensor<3x3x256x512xf16> @@ -130,14 +122,6 @@ module { %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x256x512xf16>) -> tensor<512x256x3x3xf16> return %1 : tensor<512x256x3x3xf16> } - func.func private @BatchNormGradOp16(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %1 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %2 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } func.func private @ConvBackwardDataOp17(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512x256x1x1xf16>) -> tensor<1x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<512x256x1x1xf16>) -> tensor<1x1x256x512xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<1x1x256x512xf16>) -> tensor<1x256x14x14xf16> @@ -149,15 +133,33 @@ module { return %1 : tensor<512x256x1x1xf16> } func.func private @Unknown19(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>, %arg2: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, 
%in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x256x14x14xf16> + %1 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %0) -> (tensor<1x256x14x14xf16>) { + %2 = scf.for %arg5 = %c0 to %c14 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x256x14x14xf16>) { + %3 = scf.for %arg7 = %c0 to %c14 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: f16, %out: f16): + %6 = arith.addf %in, %in_2 : f16 + %7 = arith.cmpf ogt, %in_3, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x256x14x14xf16> + scf.yield %inserted_slice : tensor<1x256x14x14xf16> + } + scf.yield %3 : tensor<1x256x14x14xf16> + } + scf.yield %2 : tensor<1x256x14x14xf16> + } return %1 : tensor<1x256x14x14xf16> } func.func private @BatchNormGradOp20(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { @@ -180,85 +182,33 @@ module { return %1 : tensor<256x256x3x3xf16> } func.func private @Unknown23(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @BatchNormGradOp24(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %1 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, 
tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp25(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<1x256x14x14xf16> - return %2 : tensor<1x256x14x14xf16> - } - func.func private @ConvBackwardFilterOp26(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown27(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>, %arg2: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @BatchNormGradOp28(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: 
tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %1 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp29(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<1x256x14x14xf16> - return %2 : tensor<1x256x14x14xf16> - } - func.func private @ConvBackwardFilterOp30(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown31(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], 
iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x256x14x14xf16> + %1 = scf.for %arg2 = %c0 to %c256 step %c1 iter_args(%arg3 = %0) -> (tensor<1x256x14x14xf16>) { + %2 = scf.for %arg4 = %c0 to %c14 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x256x14x14xf16>) { + %3 = scf.for %arg6 = %c0 to %c14 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x256x14x14xf16> + scf.yield %inserted_slice : tensor<1x256x14x14xf16> + } + scf.yield %3 : tensor<1x256x14x14xf16> + } + scf.yield %2 : tensor<1x256x14x14xf16> + } return %1 : tensor<1x256x14x14xf16> } - func.func private @BatchNormGradOp32(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %1 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } func.func private @ConvBackwardDataOp33(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256x128x3x3xf16>) -> tensor<1x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<256x128x3x3xf16>) -> tensor<3x3x128x256xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x128x256xf16>) -> tensor<3x3x128x256xf16> @@ -270,14 +220,6 @@ module { %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 
2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x128x256xf16>) -> tensor<256x128x3x3xf16> return %1 : tensor<256x128x3x3xf16> } - func.func private @BatchNormGradOp35(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %1 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } func.func private @ConvBackwardDataOp36(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256x128x1x1xf16>) -> tensor<1x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<256x128x1x1xf16>) -> tensor<1x1x128x256xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<1x1x128x256xf16>) -> tensor<1x128x28x28xf16> @@ -289,15 +231,33 @@ module { return %1 : tensor<256x128x1x1xf16> } func.func private @Unknown38(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>, %arg2: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x128x28x28xf16> + %1 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %0) -> (tensor<1x128x28x28xf16>) { + %2 = scf.for %arg5 = %c0 to %c28 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x128x28x28xf16>) { + %3 = scf.for %arg7 = %c0 to %c28 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg3, 
%arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: f16, %out: f16): + %6 = arith.addf %in, %in_2 : f16 + %7 = arith.cmpf ogt, %in_3, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x128x28x28xf16> + scf.yield %inserted_slice : tensor<1x128x28x28xf16> + } + scf.yield %3 : tensor<1x128x28x28xf16> + } + scf.yield %2 : tensor<1x128x28x28xf16> + } return %1 : tensor<1x128x28x28xf16> } func.func private @BatchNormGradOp39(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { @@ -320,85 +280,33 @@ module { return %1 : tensor<128x128x3x3xf16> } func.func private @Unknown42(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @BatchNormGradOp43(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %1 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp44(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, 
__byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<1x128x28x28xf16> - return %2 : tensor<1x128x28x28xf16> - } - func.func private @ConvBackwardFilterOp45(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown46(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>, %arg2: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @BatchNormGradOp47(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %1 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, 
%2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp48(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<1x128x28x28xf16> - return %2 : tensor<1x128x28x28xf16> - } - func.func private @ConvBackwardFilterOp49(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown50(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x128x28x28xf16> + %1 = scf.for %arg2 = %c0 to %c128 step %c1 iter_args(%arg3 = %0) -> (tensor<1x128x28x28xf16>) { + %2 = scf.for %arg4 = %c0 to %c28 step %c1 iter_args(%arg5 = %arg3) -> 
(tensor<1x128x28x28xf16>) { + %3 = scf.for %arg6 = %c0 to %c28 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x128x28x28xf16> + scf.yield %inserted_slice : tensor<1x128x28x28xf16> + } + scf.yield %3 : tensor<1x128x28x28xf16> + } + scf.yield %2 : tensor<1x128x28x28xf16> + } return %1 : tensor<1x128x28x28xf16> } - func.func private @BatchNormGradOp51(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %1 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } func.func private @ConvBackwardDataOp52(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128x64x3x3xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<128x64x3x3xf16>) -> tensor<3x3x64x128xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x64x128xf16>) -> tensor<3x3x64x128xf16> @@ -410,14 +318,6 @@ module { %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x64x128xf16>) -> tensor<128x64x3x3xf16> return %1 : tensor<128x64x3x3xf16> } - func.func private @BatchNormGradOp54(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : 
tensor<128xf32> - %1 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } func.func private @ConvBackwardDataOp55(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128x64x1x1xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<128x64x1x1xf16>) -> tensor<1x1x64x128xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<1x1x64x128xf16>) -> tensor<1x64x56x56xf16> @@ -429,15 +329,33 @@ module { return %1 : tensor<128x64x1x1xf16> } func.func private @Unknown57(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>, %arg2: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg5 = %c0 to %c56 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg7 = %c0 to %c56 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) 
outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: f16, %out: f16): + %6 = arith.addf %in, %in_2 : f16 + %7 = arith.cmpf ogt, %in_3, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield %inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } func.func private @BatchNormGradOp58(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { @@ -460,114 +378,85 @@ module { return %1 : tensor<64x64x3x3xf16> } func.func private @Unknown61(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @BatchNormGradOp62(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %1 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %2 = mhlo.convert %arg2 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x56x56xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp63(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : 
(tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<1x64x56x56xf16> - return %2 : tensor<1x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp64(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown65(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>, %arg2: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @BatchNormGradOp66(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %1 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %2 = mhlo.convert %arg2 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x56x56xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp67(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, 
__byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<1x64x56x56xf16> - return %2 : tensor<1x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp68(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown69(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c56 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, 
%in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield %inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } - func.func private @BatchNormGradOp70(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %1 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %2 = mhlo.convert %arg2 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x56x56xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp71(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<1x64x56x56xf16> - return %2 : tensor<1x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp72(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : 
tensor<4xi64>} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } func.func private @Unknown73(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - linalg.yield %2 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c56 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield %inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } func.func private @Unknown74(%arg0: tensor<1x64x112x112xf16>, %arg1: tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x112x112xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>) outs(%0 : tensor<1x64x112x112xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x112x112xf16> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<1x64x112x112xf16>) { + %2 = scf.for %arg4 = %c0 to %c112 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x64x112x112xf16>) { + %3 = scf.for %arg6 = %c0 to %c112 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x64x112x112xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x112x112xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x112x112xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) 
outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x112x112xf16> + scf.yield %inserted_slice : tensor<1x64x112x112xf16> + } + scf.yield %3 : tensor<1x64x112x112xf16> + } + scf.yield %2 : tensor<1x64x112x112xf16> + } return %1 : tensor<1x64x112x112xf16> } func.func private @BatchNormGradOp75(%arg0: tensor<1x64x112x112xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x112x112xf16>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { @@ -584,333 +473,455 @@ module { return %1 : tensor<64x3x7x7xf16> } func.func private @Unknown77(%arg0: tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x3x7x7xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x3x7x7xf16>) outs(%0 : tensor<64x3x7x7xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x3x7x7xf32> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x3x7x7xf32>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x3x7x7xf32>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x3x7x7xf32>) { + %4 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x3x7x7xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x3x7x7xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x3x7x7xf32> + scf.yield %inserted_slice : tensor<64x3x7x7xf32> + } + scf.yield %4 : tensor<64x3x7x7xf32> + } + scf.yield %3 : tensor<64x3x7x7xf32> + } + scf.yield %2 : tensor<64x3x7x7xf32> + } return %1 : tensor<64x3x7x7xf32> } - func.func private @Unknown78(%arg0: tensor<1x1000xf16>) -> tensor<1x1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: tensor<1x1000xf16>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1x1000xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x1000xf16>) outs(%0 : tensor<1x1000xf32>) { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<1x1000xf32> - return %1 : tensor<1x1000xf32> - } - func.func private @Unknown79(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes 
{__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<1000xf32> - %1 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel"]} ins(%arg0 : tensor<1000xf32>) outs(%0 : tensor<1000xf32>) { - ^bb0(%in: f32, %out: f32): - %2 = arith.truncf %in : f32 to f16 - %3 = arith.extf %2 : f16 to f32 - linalg.yield %3 : f32 - } -> tensor<1000xf32> - return %1 : tensor<1000xf32> - } - func.func private @Unknown80(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1x1000xf32>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1] [1, 1] [1, 1] : tensor<1x1000xf16> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f16, %out: f32): + %4 = arith.extf %in : f16 to f32 + %5 = arith.truncf %4 : f32 to f16 + %6 = arith.extf %5 : f16 to f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[0, %arg1] [1, 1] [1, 1] : tensor into tensor<1x1000xf32> + scf.yield %inserted_slice : tensor<1x1000xf32> + } + %collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x1000xf32> into tensor<1000xf32> + return %collapsed : tensor<1000xf32> + } + func.func private @Unknown79(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1000x512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1000x512xf16>) outs(%0 : tensor<1000x512xf32>) { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<1000x512xf32> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000x512xf32>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1000x512xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<1000x512xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<1000x512xf32> + scf.yield %inserted_slice : tensor<1000x512xf32> + } + scf.yield %2 : tensor<1000x512xf32> + } return %1 : tensor<1000x512xf32> } - func.func private @Unknown81(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown82(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic 
{indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown83(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown80(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown84(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x64x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x64x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x64x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x64x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x64x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x64x3x3xf32> + scf.yield %inserted_slice : tensor<64x64x3x3xf32> + } + scf.yield %4 : tensor<64x64x3x3xf32> + } + scf.yield %3 : tensor<64x64x3x3xf32> + } + scf.yield %2 : tensor<64x64x3x3xf32> + } return %1 : tensor<64x64x3x3xf32> } - func.func private @Unknown85(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown84(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : 
tensor<128x64x3x3xf16>) outs(%0 : tensor<128x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x64x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x64x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x64x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x3x3xf32> + scf.yield %inserted_slice : tensor<128x64x3x3xf32> + } + scf.yield %4 : tensor<128x64x3x3xf32> + } + scf.yield %3 : tensor<128x64x3x3xf32> + } + scf.yield %2 : tensor<128x64x3x3xf32> + } return %1 : tensor<128x64x3x3xf32> } - func.func private @Unknown86(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown85(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x128x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x128x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x128x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x128x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x128x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x128x3x3xf32> + scf.yield %inserted_slice : tensor<128x128x3x3xf32> + } + scf.yield %4 : tensor<128x128x3x3xf32> + } + scf.yield %3 : tensor<128x128x3x3xf32> + } + scf.yield %2 : tensor<128x128x3x3xf32> + } return %1 : tensor<128x128x3x3xf32> } - func.func private @Unknown87(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private 
@Unknown86(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x1x1xf16>) outs(%0 : tensor<128x64x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x64x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x1x1xf32> + scf.yield %inserted_slice : tensor<128x64x1x1xf32> + } + scf.yield %2 : tensor<128x64x1x1xf32> + } return %1 : tensor<128x64x1x1xf32> } - func.func private @Unknown88(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> - return %1 : tensor<128x128x3x3xf32> - } - func.func private @Unknown89(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> - return %1 : tensor<128x128x3x3xf32> - } - func.func private @Unknown90(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x3x3xf16>) outs(%0 : tensor<256x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - 
linalg.yield %2 : f32 - } -> tensor<256x128x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x128x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x128x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x3x3xf32> + scf.yield %inserted_slice : tensor<256x128x3x3xf32> + } + scf.yield %4 : tensor<256x128x3x3xf32> + } + scf.yield %3 : tensor<256x128x3x3xf32> + } + scf.yield %2 : tensor<256x128x3x3xf32> + } return %1 : tensor<256x128x3x3xf32> } - func.func private @Unknown91(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown90(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x256x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x256x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x256x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x256x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x256x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x256x3x3xf32> + scf.yield %inserted_slice : tensor<256x256x3x3xf32> + } + scf.yield %4 : tensor<256x256x3x3xf32> + } + scf.yield %3 : tensor<256x256x3x3xf32> + } + scf.yield %2 : tensor<256x256x3x3xf32> + } return %1 : tensor<256x256x3x3xf32> } - func.func private @Unknown92(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown91(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 
: index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x1x1xf16>) outs(%0 : tensor<256x128x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x128x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x1x1xf32> + scf.yield %inserted_slice : tensor<256x128x1x1xf32> + } + scf.yield %2 : tensor<256x128x1x1xf32> + } return %1 : tensor<256x128x1x1xf32> } - func.func private @Unknown93(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> - return %1 : tensor<256x256x3x3xf32> - } - func.func private @Unknown94(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> - return %1 : tensor<256x256x3x3xf32> - } - func.func private @Unknown95(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown94(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x3x3xf16>) outs(%0 : tensor<512x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x256x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x3x3xf32>) { + %2 = scf.for 
%arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x256x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x256x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x3x3xf32> + scf.yield %inserted_slice : tensor<512x256x3x3xf32> + } + scf.yield %4 : tensor<512x256x3x3xf32> + } + scf.yield %3 : tensor<512x256x3x3xf32> + } + scf.yield %2 : tensor<512x256x3x3xf32> + } return %1 : tensor<512x256x3x3xf32> } - func.func private @Unknown96(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown95(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x512x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x512x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x512x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x512x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x512x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x512x3x3xf32> + scf.yield %inserted_slice : tensor<512x512x3x3xf32> + } + scf.yield %4 : tensor<512x512x3x3xf32> + } + scf.yield %3 : tensor<512x512x3x3xf32> + } + scf.yield %2 : tensor<512x512x3x3xf32> + } return %1 : tensor<512x512x3x3xf32> } - func.func private @Unknown97(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown96(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x1x1xf32> - %1 = linalg.generic {indexing_maps = 
[#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x1x1xf16>) outs(%0 : tensor<512x256x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x256x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x1x1xf32> + scf.yield %inserted_slice : tensor<512x256x1x1xf32> + } + scf.yield %2 : tensor<512x256x1x1xf32> + } return %1 : tensor<512x256x1x1xf32> } - func.func private @Unknown98(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> - return %1 : tensor<512x512x3x3xf32> - } - func.func private @Unknown99(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> - return %1 : tensor<512x512x3x3xf32> - } func.func @main(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>, %arg3: tensor<64xf32>, %arg4: tensor<64xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64xf32>, %arg10: tensor<128xf32>, %arg11: tensor<128xf32>, %arg12: tensor<128xf32>, %arg13: tensor<128xf32>, %arg14: tensor<128xf32>, %arg15: tensor<128xf32>, %arg16: tensor<128xf32>, %arg17: tensor<128xf32>, %arg18: tensor<128xf32>, %arg19: tensor<128xf32>, %arg20: tensor<256xf32>, %arg21: tensor<256xf32>, %arg22: tensor<256xf32>, %arg23: tensor<256xf32>, %arg24: tensor<256xf32>, %arg25: tensor<256xf32>, %arg26: tensor<256xf32>, %arg27: tensor<256xf32>, %arg28: tensor<256xf32>, %arg29: tensor<256xf32>, %arg30: tensor<512xf32>, %arg31: tensor<512xf32>, %arg32: tensor<512xf32>, %arg33: tensor<512xf32>, %arg34: tensor<512xf32>, %arg35: tensor<512xf32>, %arg36: tensor<512xf32>, %arg37: tensor<512xf32>, %arg38: tensor<512xf32>, %arg39: tensor<512xf32>, %arg40: tensor<64xf32>, %arg41: tensor<64xf32>, %arg42: tensor<64xf32>, %arg43: tensor<64xf32>, %arg44: tensor<64xf32>, %arg45: tensor<64xf32>, 
%arg46: tensor<64xf32>, %arg47: tensor<64xf32>, %arg48: tensor<64xf32>, %arg49: tensor<64xf32>, %arg50: tensor<128xf32>, %arg51: tensor<128xf32>, %arg52: tensor<128xf32>, %arg53: tensor<128xf32>, %arg54: tensor<128xf32>, %arg55: tensor<128xf32>, %arg56: tensor<128xf32>, %arg57: tensor<128xf32>, %arg58: tensor<128xf32>, %arg59: tensor<128xf32>, %arg60: tensor<256xf32>, %arg61: tensor<256xf32>, %arg62: tensor<256xf32>, %arg63: tensor<256xf32>, %arg64: tensor<256xf32>, %arg65: tensor<256xf32>, %arg66: tensor<256xf32>, %arg67: tensor<256xf32>, %arg68: tensor<256xf32>, %arg69: tensor<256xf32>, %arg70: tensor<512xf32>, %arg71: tensor<512xf32>, %arg72: tensor<512xf32>, %arg73: tensor<512xf32>, %arg74: tensor<512xf32>, %arg75: tensor<512xf32>, %arg76: tensor<512xf32>, %arg77: tensor<512xf32>, %arg78: tensor<512xf32>, %arg79: tensor<512xf32>, %arg80: tensor<64x3x7x7xf16>, %arg81: tensor<1x3x224x224xf16>, %arg82: tensor<1x64x112x112xf16>, %arg83: tensor<1x64x112x112xf16>, %arg84: tensor<1x64x56x56xf16>, %arg85: tensor<64x64x3x3xf16>, %arg86: tensor<1x64x56x56xf16>, %arg87: tensor<1x64x56x56xf16>, %arg88: tensor<64x64x3x3xf16>, %arg89: tensor<1x64x56x56xf16>, %arg90: tensor<1x64x56x56xf16>, %arg91: tensor<64x64x3x3xf16>, %arg92: tensor<1x64x56x56xf16>, %arg93: tensor<1x64x56x56xf16>, %arg94: tensor<64x64x3x3xf16>, %arg95: tensor<1x64x56x56xf16>, %arg96: tensor<1x64x56x56xf16>, %arg97: tensor<128x64x3x3xf16>, %arg98: tensor<1x128x28x28xf16>, %arg99: tensor<1x128x28x28xf16>, %arg100: tensor<128x128x3x3xf16>, %arg101: tensor<1x128x28x28xf16>, %arg102: tensor<128x64x1x1xf16>, %arg103: tensor<1x128x28x28xf16>, %arg104: tensor<1x128x28x28xf16>, %arg105: tensor<128x128x3x3xf16>, %arg106: tensor<1x128x28x28xf16>, %arg107: tensor<1x128x28x28xf16>, %arg108: tensor<128x128x3x3xf16>, %arg109: tensor<1x128x28x28xf16>, %arg110: tensor<1x128x28x28xf16>, %arg111: tensor<256x128x3x3xf16>, %arg112: tensor<1x256x14x14xf16>, %arg113: tensor<1x256x14x14xf16>, %arg114: tensor<256x256x3x3xf16>, %arg115: tensor<1x256x14x14xf16>, %arg116: tensor<256x128x1x1xf16>, %arg117: tensor<1x256x14x14xf16>, %arg118: tensor<1x256x14x14xf16>, %arg119: tensor<256x256x3x3xf16>, %arg120: tensor<1x256x14x14xf16>, %arg121: tensor<1x256x14x14xf16>, %arg122: tensor<256x256x3x3xf16>, %arg123: tensor<1x256x14x14xf16>, %arg124: tensor<1x256x14x14xf16>, %arg125: tensor<512x256x3x3xf16>, %arg126: tensor<1x512x7x7xf16>, %arg127: tensor<1x512x7x7xf16>, %arg128: tensor<512x512x3x3xf16>, %arg129: tensor<1x512x7x7xf16>, %arg130: tensor<512x256x1x1xf16>, %arg131: tensor<1x512x7x7xf16>, %arg132: tensor<1x512x7x7xf16>, %arg133: tensor<512x512x3x3xf16>, %arg134: tensor<1x512x7x7xf16>, %arg135: tensor<1x512x7x7xf16>, %arg136: tensor<512x512x3x3xf16>, %arg137: tensor<1x512x7x7xf16>, %arg138: tensor<1x512x7x7xf16>, %arg139: tensor<1x512xf16>, %arg140: tensor<512x1000xf16>, %arg141: tensor<1x1000xf16>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, 
tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32>) { - %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = mhlo.constant dense<0.000000e+00> : tensor - %2 = "mhlo.dot_general"(%arg141, %arg140) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<1x1000xf16>, tensor<512x1000xf16>) -> tensor<1x512xf16> - %3 = call @Unknown0(%2, %arg138) : (tensor<1x512xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %4:3 = call @BatchNormGradOp1(%arg137, %arg39, %3) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %5 = call @ConvBackwardDataOp2(%4#0, %arg136) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %6 = call @ConvBackwardFilterOp3(%arg135, %4#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %7 = call @Unknown4(%arg135, %5) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %8:3 = call @BatchNormGradOp5(%arg134, %arg37, %7) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %9 = call @ConvBackwardDataOp6(%8#0, %arg133) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %10 = call @ConvBackwardFilterOp7(%arg132, %8#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %11 = call @Unknown8(%3, %9, %arg132) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %12:3 = call @BatchNormGradOp9(%arg129, %arg33, %11) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %13 = call @ConvBackwardDataOp10(%12#0, %arg128) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %14 = call @ConvBackwardFilterOp11(%arg127, %12#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %15 = call @Unknown12(%arg127, %13) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %16:3 = call @BatchNormGradOp13(%arg126, %arg31, %15) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %17 = call @ConvBackwardDataOp14(%16#0, %arg125) : (tensor<1x512x7x7xf16>, tensor<512x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %18 = call @ConvBackwardFilterOp15(%arg124, %16#0) : (tensor<1x256x14x14xf16>, tensor<1x512x7x7xf16>) -> tensor<512x256x3x3xf16> - %19:3 = call @BatchNormGradOp16(%arg131, %arg35, %11) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %20 = call @ConvBackwardDataOp17(%19#0, %arg130) : (tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>) -> tensor<1x256x14x14xf16> - %21 = call @ConvBackwardFilterOp18(%arg124, %19#0) : (tensor<1x256x14x14xf16>, tensor<1x512x7x7xf16>) -> tensor<512x256x1x1xf16> - %22 = call @Unknown19(%20, %17, %arg124) : (tensor<1x256x14x14xf16>, 
tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %23:3 = call @BatchNormGradOp20(%arg123, %arg29, %22) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %24 = call @ConvBackwardDataOp21(%23#0, %arg122) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %25 = call @ConvBackwardFilterOp22(%arg121, %23#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %26 = call @Unknown23(%arg121, %24) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %27:3 = call @BatchNormGradOp24(%arg120, %arg27, %26) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %28 = call @ConvBackwardDataOp25(%27#0, %arg119) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %29 = call @ConvBackwardFilterOp26(%arg118, %27#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %30 = call @Unknown27(%22, %28, %arg118) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %31:3 = call @BatchNormGradOp28(%arg115, %arg23, %30) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %32 = call @ConvBackwardDataOp29(%31#0, %arg114) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %33 = call @ConvBackwardFilterOp30(%arg113, %31#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %34 = call @Unknown31(%arg113, %32) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %35:3 = call @BatchNormGradOp32(%arg112, %arg21, %34) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %36 = call @ConvBackwardDataOp33(%35#0, %arg111) : (tensor<1x256x14x14xf16>, tensor<256x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %37 = call @ConvBackwardFilterOp34(%arg110, %35#0) : (tensor<1x128x28x28xf16>, tensor<1x256x14x14xf16>) -> tensor<256x128x3x3xf16> - %38:3 = call @BatchNormGradOp35(%arg117, %arg25, %30) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %39 = call @ConvBackwardDataOp36(%38#0, %arg116) : (tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>) -> tensor<1x128x28x28xf16> - %40 = call @ConvBackwardFilterOp37(%arg110, %38#0) : (tensor<1x128x28x28xf16>, tensor<1x256x14x14xf16>) -> tensor<256x128x1x1xf16> - %41 = call @Unknown38(%39, %36, %arg110) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %42:3 = call @BatchNormGradOp39(%arg109, %arg19, %41) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %43 = call @ConvBackwardDataOp40(%42#0, %arg108) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %44 = call @ConvBackwardFilterOp41(%arg107, %42#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %45 = call @Unknown42(%arg107, %43) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %46:3 = call @BatchNormGradOp43(%arg106, %arg17, %45) : (tensor<1x128x28x28xf16>, tensor<128xf32>, 
tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %47 = call @ConvBackwardDataOp44(%46#0, %arg105) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %48 = call @ConvBackwardFilterOp45(%arg104, %46#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %49 = call @Unknown46(%41, %47, %arg104) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %50:3 = call @BatchNormGradOp47(%arg101, %arg13, %49) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %51 = call @ConvBackwardDataOp48(%50#0, %arg100) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %52 = call @ConvBackwardFilterOp49(%arg99, %50#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %53 = call @Unknown50(%arg99, %51) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %54:3 = call @BatchNormGradOp51(%arg98, %arg11, %53) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %55 = call @ConvBackwardDataOp52(%54#0, %arg97) : (tensor<1x128x28x28xf16>, tensor<128x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %56 = call @ConvBackwardFilterOp53(%arg96, %54#0) : (tensor<1x64x56x56xf16>, tensor<1x128x28x28xf16>) -> tensor<128x64x3x3xf16> - %57:3 = call @BatchNormGradOp54(%arg103, %arg15, %49) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %58 = call @ConvBackwardDataOp55(%57#0, %arg102) : (tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>) -> tensor<1x64x56x56xf16> - %59 = call @ConvBackwardFilterOp56(%arg96, %57#0) : (tensor<1x64x56x56xf16>, tensor<1x128x28x28xf16>) -> tensor<128x64x1x1xf16> - %60 = call @Unknown57(%58, %55, %arg96) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %61:3 = call @BatchNormGradOp58(%arg95, %arg9, %60) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %62 = call @ConvBackwardDataOp59(%61#0, %arg94) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %63 = call @ConvBackwardFilterOp60(%arg93, %61#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %64 = call @Unknown61(%arg93, %62) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %65:3 = call @BatchNormGradOp62(%arg92, %arg7, %64) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %66 = call @ConvBackwardDataOp63(%65#0, %arg91) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %67 = call @ConvBackwardFilterOp64(%arg90, %65#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %68 = call @Unknown65(%60, %66, %arg90) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %69:3 = call @BatchNormGradOp66(%arg89, %arg5, %68) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %70 = call @ConvBackwardDataOp67(%69#0, %arg88) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %71 = call 
@ConvBackwardFilterOp68(%arg87, %69#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %72 = call @Unknown69(%arg87, %70) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %73:3 = call @BatchNormGradOp70(%arg86, %arg3, %72) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %74 = call @ConvBackwardDataOp71(%73#0, %arg85) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %75 = call @ConvBackwardFilterOp72(%arg84, %73#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %76 = call @Unknown73(%68, %74) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %77 = "mhlo.select_and_scatter"(%arg83, %76, %1) ({ + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = "mhlo.dot_general"(%arg141, %arg140) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<1x1000xf16>, tensor<512x1000xf16>) -> tensor<1x512xf16> + %2 = call @Unknown0(%1, %arg138) : (tensor<1x512xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %3:3 = call @BatchNormGradOp1(%arg137, %arg39, %2) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %4 = call @ConvBackwardDataOp2(%3#0, %arg136) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %5 = call @ConvBackwardFilterOp3(%arg135, %3#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %6 = call @Unknown4(%arg135, %4) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %7:3 = call @BatchNormGradOp1(%arg134, %arg37, %6) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %8 = call @ConvBackwardDataOp2(%7#0, %arg133) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %9 = call @ConvBackwardFilterOp3(%arg132, %7#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %10 = call @Unknown8(%2, %8, %arg132) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %11:3 = call @BatchNormGradOp1(%arg129, %arg33, %10) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %12 = call @ConvBackwardDataOp2(%11#0, %arg128) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %13 = call @ConvBackwardFilterOp3(%arg127, %11#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %14 = call @Unknown4(%arg127, %12) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %15:3 = call @BatchNormGradOp1(%arg126, %arg31, %14) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %16 = call @ConvBackwardDataOp14(%15#0, %arg125) : (tensor<1x512x7x7xf16>, tensor<512x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %17 = call @ConvBackwardFilterOp15(%arg124, %15#0) : (tensor<1x256x14x14xf16>, tensor<1x512x7x7xf16>) -> tensor<512x256x3x3xf16> + %18:3 = call @BatchNormGradOp1(%arg131, %arg35, %10) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %19 = call @ConvBackwardDataOp17(%18#0, %arg130) : (tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>) -> 
tensor<1x256x14x14xf16> + %20 = call @ConvBackwardFilterOp18(%arg124, %18#0) : (tensor<1x256x14x14xf16>, tensor<1x512x7x7xf16>) -> tensor<512x256x1x1xf16> + %21 = call @Unknown19(%19, %16, %arg124) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %22:3 = call @BatchNormGradOp20(%arg123, %arg29, %21) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %23 = call @ConvBackwardDataOp21(%22#0, %arg122) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %24 = call @ConvBackwardFilterOp22(%arg121, %22#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %25 = call @Unknown23(%arg121, %23) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %26:3 = call @BatchNormGradOp20(%arg120, %arg27, %25) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %27 = call @ConvBackwardDataOp21(%26#0, %arg119) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %28 = call @ConvBackwardFilterOp22(%arg118, %26#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %29 = call @Unknown19(%21, %27, %arg118) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %30:3 = call @BatchNormGradOp20(%arg115, %arg23, %29) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %31 = call @ConvBackwardDataOp21(%30#0, %arg114) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %32 = call @ConvBackwardFilterOp22(%arg113, %30#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %33 = call @Unknown23(%arg113, %31) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %34:3 = call @BatchNormGradOp20(%arg112, %arg21, %33) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %35 = call @ConvBackwardDataOp33(%34#0, %arg111) : (tensor<1x256x14x14xf16>, tensor<256x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %36 = call @ConvBackwardFilterOp34(%arg110, %34#0) : (tensor<1x128x28x28xf16>, tensor<1x256x14x14xf16>) -> tensor<256x128x3x3xf16> + %37:3 = call @BatchNormGradOp20(%arg117, %arg25, %29) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %38 = call @ConvBackwardDataOp36(%37#0, %arg116) : (tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>) -> tensor<1x128x28x28xf16> + %39 = call @ConvBackwardFilterOp37(%arg110, %37#0) : (tensor<1x128x28x28xf16>, tensor<1x256x14x14xf16>) -> tensor<256x128x1x1xf16> + %40 = call @Unknown38(%38, %35, %arg110) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %41:3 = call @BatchNormGradOp39(%arg109, %arg19, %40) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %42 = call @ConvBackwardDataOp40(%41#0, %arg108) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %43 = call @ConvBackwardFilterOp41(%arg107, %41#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> + 
%44 = call @Unknown42(%arg107, %42) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %45:3 = call @BatchNormGradOp39(%arg106, %arg17, %44) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %46 = call @ConvBackwardDataOp40(%45#0, %arg105) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %47 = call @ConvBackwardFilterOp41(%arg104, %45#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %48 = call @Unknown38(%40, %46, %arg104) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %49:3 = call @BatchNormGradOp39(%arg101, %arg13, %48) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %50 = call @ConvBackwardDataOp40(%49#0, %arg100) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %51 = call @ConvBackwardFilterOp41(%arg99, %49#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %52 = call @Unknown42(%arg99, %50) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %53:3 = call @BatchNormGradOp39(%arg98, %arg11, %52) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %54 = call @ConvBackwardDataOp52(%53#0, %arg97) : (tensor<1x128x28x28xf16>, tensor<128x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %55 = call @ConvBackwardFilterOp53(%arg96, %53#0) : (tensor<1x64x56x56xf16>, tensor<1x128x28x28xf16>) -> tensor<128x64x3x3xf16> + %56:3 = call @BatchNormGradOp39(%arg103, %arg15, %48) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %57 = call @ConvBackwardDataOp55(%56#0, %arg102) : (tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>) -> tensor<1x64x56x56xf16> + %58 = call @ConvBackwardFilterOp56(%arg96, %56#0) : (tensor<1x64x56x56xf16>, tensor<1x128x28x28xf16>) -> tensor<128x64x1x1xf16> + %59 = call @Unknown57(%57, %54, %arg96) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %60:3 = call @BatchNormGradOp58(%arg95, %arg9, %59) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %61 = call @ConvBackwardDataOp59(%60#0, %arg94) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %62 = call @ConvBackwardFilterOp60(%arg93, %60#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %63 = call @Unknown61(%arg93, %61) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %64:3 = call @BatchNormGradOp58(%arg92, %arg7, %63) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %65 = call @ConvBackwardDataOp59(%64#0, %arg91) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %66 = call @ConvBackwardFilterOp60(%arg90, %64#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %67 = call @Unknown57(%59, %65, %arg90) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %68:3 = call @BatchNormGradOp58(%arg89, %arg5, %67) : (tensor<1x64x56x56xf16>, tensor<64xf32>, 
tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %69 = call @ConvBackwardDataOp59(%68#0, %arg88) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %70 = call @ConvBackwardFilterOp60(%arg87, %68#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %71 = call @Unknown61(%arg87, %69) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %72:3 = call @BatchNormGradOp58(%arg86, %arg3, %71) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %73 = call @ConvBackwardDataOp59(%72#0, %arg85) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %74 = call @ConvBackwardFilterOp60(%arg84, %72#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %75 = call @Unknown73(%67, %73) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %76 = "mhlo.select_and_scatter"(%arg83, %75, %0) ({ ^bb0(%arg142: tensor, %arg143: tensor): - %107 = mhlo.compare GE, %arg142, %arg143 : (tensor, tensor) -> tensor - mhlo.return %107 : tensor + %104 = mhlo.compare GE, %arg142, %arg143 : (tensor, tensor) -> tensor + mhlo.return %104 : tensor }, { ^bb0(%arg142: tensor, %arg143: tensor): - %107 = mhlo.add %arg142, %arg143 : tensor - mhlo.return %107 : tensor + %104 = mhlo.add %arg142, %arg143 : tensor + mhlo.return %104 : tensor }) {padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : (tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor) -> tensor<1x64x112x112xf16> - %78 = call @Unknown74(%arg83, %77) : (tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> - %79:3 = call @BatchNormGradOp75(%arg82, %arg1, %78) : (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<1x64x112x112xf16>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) - %80 = call @ConvBackwardFilterOp76(%arg81, %79#0) : (tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>) -> tensor<64x3x7x7xf16> - %81 = call @Unknown77(%80) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> - %82 = call @Unknown78(%arg141) : (tensor<1x1000xf16>) -> tensor<1x1000xf32> - %83 = mhlo.reduce(%82 init: %0) across dimensions = [0] : (tensor<1x1000xf32>, tensor) -> tensor<1000xf32> - reducer(%arg142: tensor, %arg143: tensor) { - %107 = mhlo.add %arg142, %arg143 : tensor - mhlo.return %107 : tensor - } - %84 = call @Unknown79(%83) : (tensor<1000xf32>) -> tensor<1000xf32> - %85 = mhlo.reshape %arg141 : (tensor<1x1000xf16>) -> tensor<1000x1xf16> - %86 = "mhlo.dot"(%85, %arg139) {precision_config = [#mhlo, #mhlo]} : (tensor<1000x1xf16>, tensor<1x512xf16>) -> tensor<1000x512xf16> - %87 = call @Unknown80(%86) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> - %88 = call @Unknown81(%75) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %89 = call @Unknown82(%71) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %90 = call @Unknown83(%67) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %91 = call @Unknown84(%63) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %92 = call @Unknown85(%56) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> - %93 = call @Unknown86(%52) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %94 = call @Unknown87(%59) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> - %95 = call @Unknown88(%48) : 
(tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %96 = call @Unknown89(%44) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %97 = call @Unknown90(%37) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> - %98 = call @Unknown91(%33) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %99 = call @Unknown92(%40) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> - %100 = call @Unknown93(%29) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %101 = call @Unknown94(%25) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %102 = call @Unknown95(%18) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> - %103 = call @Unknown96(%14) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %104 = call @Unknown97(%21) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> - %105 = call @Unknown98(%10) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %106 = call @Unknown99(%6) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - return %79#2, %79#1, %81, %84, %87, %73#2, %73#1, %69#2, %69#1, %88, %89, %65#2, %65#1, %61#2, %61#1, %90, %91, %54#2, %54#1, %50#2, %50#1, %92, %93, %94, %57#2, %57#1, %46#2, %46#1, %42#2, %42#1, %95, %96, %35#2, %35#1, %31#2, %31#1, %97, %98, %99, %38#2, %38#1, %27#2, %27#1, %23#2, %23#1, %100, %101, %16#2, %16#1, %12#2, %12#1, %102, %103, %104, %19#2, %19#1, %8#2, %8#1, %4#2, %4#1, %105, %106 : tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32> + %77 = call @Unknown74(%arg83, %76) : (tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> + %78:3 = call @BatchNormGradOp75(%arg82, %arg1, %77) : (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<1x64x112x112xf16>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) + %79 = call @ConvBackwardFilterOp76(%arg81, %78#0) : (tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>) -> tensor<64x3x7x7xf16> + %80 = call @Unknown77(%79) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> + %81 = call @Unknown78(%arg141) : (tensor<1x1000xf16>) -> tensor<1000xf32> + %82 = mhlo.reshape %arg141 : (tensor<1x1000xf16>) -> tensor<1000x1xf16> + %83 = "mhlo.dot"(%82, %arg139) {precision_config = [#mhlo, #mhlo]} : (tensor<1000x1xf16>, tensor<1x512xf16>) -> tensor<1000x512xf16> + %84 = call @Unknown79(%83) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> + %85 = call @Unknown80(%74) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %86 = call 
@Unknown80(%70) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %87 = call @Unknown80(%66) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %88 = call @Unknown80(%62) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %89 = call @Unknown84(%55) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> + %90 = call @Unknown85(%51) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %91 = call @Unknown86(%58) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> + %92 = call @Unknown85(%47) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %93 = call @Unknown85(%43) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %94 = call @Unknown89(%36) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> + %95 = call @Unknown90(%32) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %96 = call @Unknown91(%39) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> + %97 = call @Unknown90(%28) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %98 = call @Unknown90(%24) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %99 = call @Unknown94(%17) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> + %100 = call @Unknown95(%13) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %101 = call @Unknown96(%20) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> + %102 = call @Unknown95(%9) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %103 = call @Unknown95(%5) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + return %78#2, %78#1, %80, %81, %84, %72#2, %72#1, %68#2, %68#1, %85, %86, %64#2, %64#1, %60#2, %60#1, %87, %88, %53#2, %53#1, %49#2, %49#1, %89, %90, %91, %56#2, %56#1, %45#2, %45#1, %41#2, %41#1, %92, %93, %34#2, %34#1, %30#2, %30#1, %94, %95, %96, %37#2, %37#1, %26#2, %26#1, %22#2, %22#1, %97, %98, %15#2, %15#1, %11#2, %11#1, %99, %100, %101, %18#2, %18#1, %7#2, %7#1, %3#2, %3#1, %102, %103 : tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/4_bufferize_opt.mlir b/compiler/test/E2E/ResNet18/BW/4_bufferize_opt.mlir index 2db090757..9a8d1f4c0 100644 --- a/compiler/test/E2E/ResNet18/BW/4_bufferize_opt.mlir +++ b/compiler/test/E2E/ResNet18/BW/4_bufferize_opt.mlir @@ -2,424 +2,661 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> -#map2 = 
affine_map<(d0, d1) -> (d0, d1)> -#map3 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> module { func.func private @Unknown0(%arg0: tensor<1x512xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %cst_0 = arith.constant 4.900000e+01 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1, %arg0 : tensor<1x512x7x7xf16>, tensor<1x512xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_1: f16, %out: f16): - %2 = arith.divf %in_1, %cst_0 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c512 step %c1 iter_args(%arg3 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c7 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2] [1, 1] [1, 1] : tensor<1x512xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %6 = arith.divf %in, %cst_0 : f16 + %7 = arith.cmpf ogt, %in_2, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } func.func private @Unknown4(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c512 step %c1 iter_args(%arg3 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c7 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + 
%5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } func.func private @Unknown8(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>, %arg2: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x512x7x7xf16> - return %1 : tensor<1x512x7x7xf16> - } - func.func private @Unknown12(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: f16, %out: f16): + %6 = arith.addf %in, %in_2 : f16 + %7 = arith.cmpf ogt, %in_3, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : 
tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } func.func private @Unknown19(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>, %arg2: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x256x14x14xf16> + %1 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %0) -> (tensor<1x256x14x14xf16>) { + %2 = scf.for %arg5 = %c0 to %c14 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x256x14x14xf16>) { + %3 = scf.for %arg7 = %c0 to %c14 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: f16, %out: f16): + %6 = arith.addf %in, %in_2 : f16 + %7 = arith.cmpf ogt, %in_3, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x256x14x14xf16> + scf.yield %inserted_slice : tensor<1x256x14x14xf16> + } + scf.yield %3 : tensor<1x256x14x14xf16> + } + scf.yield %2 : tensor<1x256x14x14xf16> + } return %1 : tensor<1x256x14x14xf16> } func.func private @Unknown23(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @Unknown27(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>, %arg2: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 
0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @Unknown31(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x256x14x14xf16> + %1 = scf.for %arg2 = %c0 to %c256 step %c1 iter_args(%arg3 = %0) -> (tensor<1x256x14x14xf16>) { + %2 = scf.for %arg4 = %c0 to %c14 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x256x14x14xf16>) { + %3 = scf.for %arg6 = %c0 to %c14 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x256x14x14xf16> + scf.yield %inserted_slice : tensor<1x256x14x14xf16> + } + scf.yield %3 : tensor<1x256x14x14xf16> + } + scf.yield %2 : tensor<1x256x14x14xf16> + } return %1 : tensor<1x256x14x14xf16> } func.func private @Unknown38(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>, %arg2: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x128x28x28xf16> + %1 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %0) -> (tensor<1x128x28x28xf16>) { + %2 = scf.for %arg5 = %c0 
to %c28 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x128x28x28xf16>) { + %3 = scf.for %arg7 = %c0 to %c28 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: f16, %out: f16): + %6 = arith.addf %in, %in_2 : f16 + %7 = arith.cmpf ogt, %in_3, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x128x28x28xf16> + scf.yield %inserted_slice : tensor<1x128x28x28xf16> + } + scf.yield %3 : tensor<1x128x28x28xf16> + } + scf.yield %2 : tensor<1x128x28x28xf16> + } return %1 : tensor<1x128x28x28xf16> } func.func private @Unknown42(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @Unknown46(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>, %arg2: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @Unknown50(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf 
ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x128x28x28xf16> + %1 = scf.for %arg2 = %c0 to %c128 step %c1 iter_args(%arg3 = %0) -> (tensor<1x128x28x28xf16>) { + %2 = scf.for %arg4 = %c0 to %c28 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x128x28x28xf16>) { + %3 = scf.for %arg6 = %c0 to %c28 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x128x28x28xf16> + scf.yield %inserted_slice : tensor<1x128x28x28xf16> + } + scf.yield %3 : tensor<1x128x28x28xf16> + } + scf.yield %2 : tensor<1x128x28x28xf16> + } return %1 : tensor<1x128x28x28xf16> } func.func private @Unknown57(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>, %arg2: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg5 = %c0 to %c56 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg7 = %c0 to %c56 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: f16, %out: f16): + %6 = arith.addf %in, %in_2 : f16 + %7 = arith.cmpf ogt, %in_3, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield 
%inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } func.func private @Unknown61(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @Unknown65(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>, %arg2: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @Unknown69(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c56 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield %inserted_slice 
: tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } func.func private @Unknown73(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - linalg.yield %2 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c56 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield %inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } func.func private @Unknown74(%arg0: tensor<1x64x112x112xf16>, %arg1: tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x112x112xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>) outs(%0 : tensor<1x64x112x112xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x112x112xf16> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<1x64x112x112xf16>) { + %2 = scf.for %arg4 = %c0 to %c112 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x64x112x112xf16>) { + %3 = scf.for %arg6 = %c0 to %c112 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x64x112x112xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x112x112xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x112x112xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} 
ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x112x112xf16> + scf.yield %inserted_slice : tensor<1x64x112x112xf16> + } + scf.yield %3 : tensor<1x64x112x112xf16> + } + scf.yield %2 : tensor<1x64x112x112xf16> + } return %1 : tensor<1x64x112x112xf16> } func.func private @Unknown77(%arg0: tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x3x7x7xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x3x7x7xf16>) outs(%0 : tensor<64x3x7x7xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x3x7x7xf32> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x3x7x7xf32>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x3x7x7xf32>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x3x7x7xf32>) { + %4 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x3x7x7xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x3x7x7xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x3x7x7xf32> + scf.yield %inserted_slice : tensor<64x3x7x7xf32> + } + scf.yield %4 : tensor<64x3x7x7xf32> + } + scf.yield %3 : tensor<64x3x7x7xf32> + } + scf.yield %2 : tensor<64x3x7x7xf32> + } return %1 : tensor<64x3x7x7xf32> } - func.func private @Unknown78(%arg0: tensor<1x1000xf16>) -> tensor<1x1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: tensor<1x1000xf16>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1x1000xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x1000xf16>) outs(%0 : tensor<1x1000xf32>) { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<1x1000xf32> - return %1 : tensor<1x1000xf32> - } - func.func private @Unknown79(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<1000xf32> - %1 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel"]} ins(%arg0 : tensor<1000xf32>) outs(%0 : tensor<1000xf32>) { - ^bb0(%in: f32, %out: f32): - %2 = arith.truncf %in : f32 to f16 - %3 = arith.extf %2 : f16 to f32 - linalg.yield %3 : f32 - } 
-> tensor<1000xf32> - return %1 : tensor<1000xf32> - } - func.func private @Unknown80(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1x1000xf32>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1] [1, 1] [1, 1] : tensor<1x1000xf16> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f16, %out: f32): + %4 = arith.extf %in : f16 to f32 + %5 = arith.truncf %4 : f32 to f16 + %6 = arith.extf %5 : f16 to f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[0, %arg1] [1, 1] [1, 1] : tensor into tensor<1x1000xf32> + scf.yield %inserted_slice : tensor<1x1000xf32> + } + %collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x1000xf32> into tensor<1000xf32> + return %collapsed : tensor<1000xf32> + } + func.func private @Unknown79(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1000x512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1000x512xf16>) outs(%0 : tensor<1000x512xf32>) { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<1000x512xf32> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000x512xf32>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1000x512xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<1000x512xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<1000x512xf32> + scf.yield %inserted_slice : tensor<1000x512xf32> + } + scf.yield %2 : tensor<1000x512xf32> + } return %1 : tensor<1000x512xf32> } - func.func private @Unknown81(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown82(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown80(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", 
"parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x64x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x64x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x64x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x64x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x64x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x64x3x3xf32> + scf.yield %inserted_slice : tensor<64x64x3x3xf32> + } + scf.yield %4 : tensor<64x64x3x3xf32> + } + scf.yield %3 : tensor<64x64x3x3xf32> + } + scf.yield %2 : tensor<64x64x3x3xf32> + } return %1 : tensor<64x64x3x3xf32> } - func.func private @Unknown83(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown84(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown85(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown84(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x3x3xf16>) outs(%0 : tensor<128x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x64x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x3x3xf32>) { + %2 = 
scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x64x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x64x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x3x3xf32> + scf.yield %inserted_slice : tensor<128x64x3x3xf32> + } + scf.yield %4 : tensor<128x64x3x3xf32> + } + scf.yield %3 : tensor<128x64x3x3xf32> + } + scf.yield %2 : tensor<128x64x3x3xf32> + } return %1 : tensor<128x64x3x3xf32> } - func.func private @Unknown86(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown85(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x128x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x128x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x128x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x128x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x128x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x128x3x3xf32> + scf.yield %inserted_slice : tensor<128x128x3x3xf32> + } + scf.yield %4 : tensor<128x128x3x3xf32> + } + scf.yield %3 : tensor<128x128x3x3xf32> + } + scf.yield %2 : tensor<128x128x3x3xf32> + } return %1 : tensor<128x128x3x3xf32> } - func.func private @Unknown87(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown86(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, 
#map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x1x1xf16>) outs(%0 : tensor<128x64x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x64x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x1x1xf32> + scf.yield %inserted_slice : tensor<128x64x1x1xf32> + } + scf.yield %2 : tensor<128x64x1x1xf32> + } return %1 : tensor<128x64x1x1xf32> } - func.func private @Unknown88(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> - return %1 : tensor<128x128x3x3xf32> - } - func.func private @Unknown89(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> - return %1 : tensor<128x128x3x3xf32> - } - func.func private @Unknown90(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x3x3xf16>) outs(%0 : tensor<256x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x128x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> 
(tensor<256x128x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x128x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x3x3xf32> + scf.yield %inserted_slice : tensor<256x128x3x3xf32> + } + scf.yield %4 : tensor<256x128x3x3xf32> + } + scf.yield %3 : tensor<256x128x3x3xf32> + } + scf.yield %2 : tensor<256x128x3x3xf32> + } return %1 : tensor<256x128x3x3xf32> } - func.func private @Unknown91(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown90(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x256x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x256x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x256x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x256x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x256x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x256x3x3xf32> + scf.yield %inserted_slice : tensor<256x256x3x3xf32> + } + scf.yield %4 : tensor<256x256x3x3xf32> + } + scf.yield %3 : tensor<256x256x3x3xf32> + } + scf.yield %2 : tensor<256x256x3x3xf32> + } return %1 : tensor<256x256x3x3xf32> } - func.func private @Unknown92(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown91(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x1x1xf16>) outs(%0 : tensor<256x128x1x1xf32>) attrs 
= {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x128x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x1x1xf32> + scf.yield %inserted_slice : tensor<256x128x1x1xf32> + } + scf.yield %2 : tensor<256x128x1x1xf32> + } return %1 : tensor<256x128x1x1xf32> } - func.func private @Unknown93(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> - return %1 : tensor<256x256x3x3xf32> - } - func.func private @Unknown94(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> - return %1 : tensor<256x256x3x3xf32> - } - func.func private @Unknown95(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown94(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x3x3xf16>) outs(%0 : tensor<512x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x256x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x256x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x256x3x3xf32>) { + %extracted_slice = 
tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x3x3xf32> + scf.yield %inserted_slice : tensor<512x256x3x3xf32> + } + scf.yield %4 : tensor<512x256x3x3xf32> + } + scf.yield %3 : tensor<512x256x3x3xf32> + } + scf.yield %2 : tensor<512x256x3x3xf32> + } return %1 : tensor<512x256x3x3xf32> } - func.func private @Unknown96(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown95(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x512x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x512x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x512x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x512x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x512x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x512x3x3xf32> + scf.yield %inserted_slice : tensor<512x512x3x3xf32> + } + scf.yield %4 : tensor<512x512x3x3xf32> + } + scf.yield %3 : tensor<512x512x3x3xf32> + } + scf.yield %2 : tensor<512x512x3x3xf32> + } return %1 : tensor<512x512x3x3xf32> } - func.func private @Unknown97(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown96(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x1x1xf16>) outs(%0 : tensor<512x256x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 
- } -> tensor<512x256x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x1x1xf32> + scf.yield %inserted_slice : tensor<512x256x1x1xf32> + } + scf.yield %2 : tensor<512x256x1x1xf32> + } return %1 : tensor<512x256x1x1xf32> } - func.func private @Unknown98(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> - return %1 : tensor<512x512x3x3xf32> - } - func.func private @Unknown99(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> - return %1 : tensor<512x512x3x3xf32> - } func.func @main(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>, %arg3: tensor<64xf32>, %arg4: tensor<64xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64xf32>, %arg10: tensor<128xf32>, %arg11: tensor<128xf32>, %arg12: tensor<128xf32>, %arg13: tensor<128xf32>, %arg14: tensor<128xf32>, %arg15: tensor<128xf32>, %arg16: tensor<128xf32>, %arg17: tensor<128xf32>, %arg18: tensor<128xf32>, %arg19: tensor<128xf32>, %arg20: tensor<256xf32>, %arg21: tensor<256xf32>, %arg22: tensor<256xf32>, %arg23: tensor<256xf32>, %arg24: tensor<256xf32>, %arg25: tensor<256xf32>, %arg26: tensor<256xf32>, %arg27: tensor<256xf32>, %arg28: tensor<256xf32>, %arg29: tensor<256xf32>, %arg30: tensor<512xf32>, %arg31: tensor<512xf32>, %arg32: tensor<512xf32>, %arg33: tensor<512xf32>, %arg34: tensor<512xf32>, %arg35: tensor<512xf32>, %arg36: tensor<512xf32>, %arg37: tensor<512xf32>, %arg38: tensor<512xf32>, %arg39: tensor<512xf32>, %arg40: tensor<64xf32>, %arg41: tensor<64xf32>, %arg42: tensor<64xf32>, %arg43: tensor<64xf32>, %arg44: tensor<64xf32>, %arg45: tensor<64xf32>, %arg46: tensor<64xf32>, %arg47: tensor<64xf32>, %arg48: tensor<64xf32>, %arg49: tensor<64xf32>, %arg50: tensor<128xf32>, %arg51: tensor<128xf32>, %arg52: tensor<128xf32>, %arg53: tensor<128xf32>, %arg54: tensor<128xf32>, %arg55: tensor<128xf32>, %arg56: tensor<128xf32>, %arg57: tensor<128xf32>, %arg58: 
tensor<128xf32>, %arg59: tensor<128xf32>, %arg60: tensor<256xf32>, %arg61: tensor<256xf32>, %arg62: tensor<256xf32>, %arg63: tensor<256xf32>, %arg64: tensor<256xf32>, %arg65: tensor<256xf32>, %arg66: tensor<256xf32>, %arg67: tensor<256xf32>, %arg68: tensor<256xf32>, %arg69: tensor<256xf32>, %arg70: tensor<512xf32>, %arg71: tensor<512xf32>, %arg72: tensor<512xf32>, %arg73: tensor<512xf32>, %arg74: tensor<512xf32>, %arg75: tensor<512xf32>, %arg76: tensor<512xf32>, %arg77: tensor<512xf32>, %arg78: tensor<512xf32>, %arg79: tensor<512xf32>, %arg80: tensor<64x3x7x7xf16>, %arg81: tensor<1x3x224x224xf16>, %arg82: tensor<1x64x112x112xf16>, %arg83: tensor<1x64x112x112xf16>, %arg84: tensor<1x64x56x56xf16>, %arg85: tensor<64x64x3x3xf16>, %arg86: tensor<1x64x56x56xf16>, %arg87: tensor<1x64x56x56xf16>, %arg88: tensor<64x64x3x3xf16>, %arg89: tensor<1x64x56x56xf16>, %arg90: tensor<1x64x56x56xf16>, %arg91: tensor<64x64x3x3xf16>, %arg92: tensor<1x64x56x56xf16>, %arg93: tensor<1x64x56x56xf16>, %arg94: tensor<64x64x3x3xf16>, %arg95: tensor<1x64x56x56xf16>, %arg96: tensor<1x64x56x56xf16>, %arg97: tensor<128x64x3x3xf16>, %arg98: tensor<1x128x28x28xf16>, %arg99: tensor<1x128x28x28xf16>, %arg100: tensor<128x128x3x3xf16>, %arg101: tensor<1x128x28x28xf16>, %arg102: tensor<128x64x1x1xf16>, %arg103: tensor<1x128x28x28xf16>, %arg104: tensor<1x128x28x28xf16>, %arg105: tensor<128x128x3x3xf16>, %arg106: tensor<1x128x28x28xf16>, %arg107: tensor<1x128x28x28xf16>, %arg108: tensor<128x128x3x3xf16>, %arg109: tensor<1x128x28x28xf16>, %arg110: tensor<1x128x28x28xf16>, %arg111: tensor<256x128x3x3xf16>, %arg112: tensor<1x256x14x14xf16>, %arg113: tensor<1x256x14x14xf16>, %arg114: tensor<256x256x3x3xf16>, %arg115: tensor<1x256x14x14xf16>, %arg116: tensor<256x128x1x1xf16>, %arg117: tensor<1x256x14x14xf16>, %arg118: tensor<1x256x14x14xf16>, %arg119: tensor<256x256x3x3xf16>, %arg120: tensor<1x256x14x14xf16>, %arg121: tensor<1x256x14x14xf16>, %arg122: tensor<256x256x3x3xf16>, %arg123: tensor<1x256x14x14xf16>, %arg124: tensor<1x256x14x14xf16>, %arg125: tensor<512x256x3x3xf16>, %arg126: tensor<1x512x7x7xf16>, %arg127: tensor<1x512x7x7xf16>, %arg128: tensor<512x512x3x3xf16>, %arg129: tensor<1x512x7x7xf16>, %arg130: tensor<512x256x1x1xf16>, %arg131: tensor<1x512x7x7xf16>, %arg132: tensor<1x512x7x7xf16>, %arg133: tensor<512x512x3x3xf16>, %arg134: tensor<1x512x7x7xf16>, %arg135: tensor<1x512x7x7xf16>, %arg136: tensor<512x512x3x3xf16>, %arg137: tensor<1x512x7x7xf16>, %arg138: tensor<1x512x7x7xf16>, %arg139: tensor<1x512xf16>, %arg140: tensor<512x1000xf16>, %arg141: tensor<1x1000xf16>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, 
tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32>) attributes {__placeholder__byre.entry_point} { %0 = tensor.empty() : tensor<1x512xf16> %1 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 1 : i64} ins(%arg141, %arg140 : tensor<1x1000xf16>, tensor<512x1000xf16>) outs(%0 : tensor<1x512xf16>) : tensor<1x512xf16> @@ -450,7 +687,7 @@ module { %26 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%24#0, %arg128 : tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%25 : tensor<1x512x7x7xf16>) : tensor<1x512x7x7xf16> %27 = tensor.empty() : tensor<512x512x3x3xf16> %28 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%arg127, %24#0 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%27 : tensor<512x512x3x3xf16>) : tensor<512x512x3x3xf16> - %29 = call @Unknown12(%arg127, %26) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %29 = call @Unknown4(%arg127, %26) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> %30 = tensor.empty() : tensor<1x512x7x7xf16> %31 = tensor.empty() : tensor<512xf32> %32 = tensor.empty() : tensor<512xf32> @@ -485,7 +722,7 @@ module { %61 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%59#0, %arg119 : tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%60 : tensor<1x256x14x14xf16>) : tensor<1x256x14x14xf16> %62 = tensor.empty() : tensor<256x256x3x3xf16> %63 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%arg118, %59#0 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%62 : tensor<256x256x3x3xf16>) : tensor<256x256x3x3xf16> - %64 = call @Unknown27(%46, %61, %arg118) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %64 = call @Unknown19(%46, %61, %arg118) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> %65 = tensor.empty() : tensor<1x256x14x14xf16> %66 = tensor.empty() : tensor<256xf32> %67 = tensor.empty() : tensor<256xf32> @@ -494,7 +731,7 @@ module { %70 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%68#0, %arg114 : tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%69 : tensor<1x256x14x14xf16>) : tensor<1x256x14x14xf16> %71 = tensor.empty() : 
tensor<256x256x3x3xf16> %72 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%arg113, %68#0 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%71 : tensor<256x256x3x3xf16>) : tensor<256x256x3x3xf16> - %73 = call @Unknown31(%arg113, %70) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %73 = call @Unknown23(%arg113, %70) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> %74 = tensor.empty() : tensor<1x256x14x14xf16> %75 = tensor.empty() : tensor<256xf32> %76 = tensor.empty() : tensor<256xf32> @@ -529,7 +766,7 @@ module { %105 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%103#0, %arg105 : tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%104 : tensor<1x128x28x28xf16>) : tensor<1x128x28x28xf16> %106 = tensor.empty() : tensor<128x128x3x3xf16> %107 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%arg104, %103#0 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%106 : tensor<128x128x3x3xf16>) : tensor<128x128x3x3xf16> - %108 = call @Unknown46(%90, %105, %arg104) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %108 = call @Unknown38(%90, %105, %arg104) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> %109 = tensor.empty() : tensor<1x128x28x28xf16> %110 = tensor.empty() : tensor<128xf32> %111 = tensor.empty() : tensor<128xf32> @@ -538,7 +775,7 @@ module { %114 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%112#0, %arg100 : tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%113 : tensor<1x128x28x28xf16>) : tensor<1x128x28x28xf16> %115 = tensor.empty() : tensor<128x128x3x3xf16> %116 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%arg99, %112#0 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%115 : tensor<128x128x3x3xf16>) : tensor<128x128x3x3xf16> - %117 = call @Unknown50(%arg99, %114) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %117 = call @Unknown42(%arg99, %114) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> %118 = tensor.empty() : tensor<1x128x28x28xf16> %119 = tensor.empty() : tensor<128xf32> %120 = tensor.empty() : tensor<128xf32> @@ -573,7 +810,7 @@ module { %149 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%147#0, %arg91 : tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%148 : tensor<1x64x56x56xf16>) : tensor<1x64x56x56xf16> %150 = tensor.empty() : tensor<64x64x3x3xf16> %151 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%arg90, %147#0 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%150 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> - %152 = call @Unknown65(%134, %149, %arg90) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %152 = call @Unknown57(%134, %149, %arg90) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> %153 = tensor.empty() : tensor<1x64x56x56xf16> %154 = tensor.empty() : tensor<64xf32> %155 = tensor.empty() : tensor<64xf32> @@ -582,7 +819,7 @@ module { %158 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%156#0, %arg88 : tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%157 : tensor<1x64x56x56xf16>) : tensor<1x64x56x56xf16> %159 = tensor.empty() : tensor<64x64x3x3xf16> %160 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%arg87, %156#0 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%159 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> - %161 = call @Unknown69(%arg87, %158) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %161 = call @Unknown61(%arg87, %158) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> %162 = tensor.empty() : tensor<1x64x56x56xf16> %163 = tensor.empty() : tensor<64xf32> %164 = tensor.empty() : tensor<64xf32> @@ -602,34 +839,31 @@ module { %178 = tensor.empty() : tensor<64x3x7x7xf16> %179 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%arg81, %177#0 : tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>) outs(%178 : tensor<64x3x7x7xf16>) : tensor<64x3x7x7xf16> %180 = call @Unknown77(%179) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> - %181 = call @Unknown78(%arg141) : (tensor<1x1000xf16>) -> tensor<1x1000xf32> - %182 = tensor.empty() : tensor<1000xf32> - %183 = byre.compute_on_tensor @ReduceSumOp_f32_f32 {dimensions = dense<0> : tensor<1xi64>} ins(%181 : tensor<1x1000xf32>) outs(%182 : tensor<1000xf32>) : tensor<1000xf32> - %184 = call @Unknown79(%183) : (tensor<1000xf32>) -> tensor<1000xf32> + %181 = call @Unknown78(%arg141) : (tensor<1x1000xf16>) -> tensor<1000xf32> %collapsed = tensor.collapse_shape %arg141 [[0, 1]] : tensor<1x1000xf16> into tensor<1000xf16> %expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<1000xf16> into tensor<1000x1xf16> - %185 
= tensor.empty() : tensor<1000x512xf16> - %186 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 0 : i64} ins(%expanded, %arg139 : tensor<1000x1xf16>, tensor<1x512xf16>) outs(%185 : tensor<1000x512xf16>) : tensor<1000x512xf16> - %187 = call @Unknown80(%186) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> - %188 = call @Unknown81(%169) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %189 = call @Unknown82(%160) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %190 = call @Unknown83(%151) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %191 = call @Unknown84(%142) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %192 = call @Unknown85(%125) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> - %193 = call @Unknown86(%116) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %194 = call @Unknown87(%133) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> - %195 = call @Unknown88(%107) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %196 = call @Unknown89(%98) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %197 = call @Unknown90(%81) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> - %198 = call @Unknown91(%72) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %199 = call @Unknown92(%89) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> - %200 = call @Unknown93(%63) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %201 = call @Unknown94(%54) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %202 = call @Unknown95(%37) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> - %203 = call @Unknown96(%28) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %204 = call @Unknown97(%45) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> - %205 = call @Unknown98(%19) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %206 = call @Unknown99(%10) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - return %177#2, %177#1, %180, %184, %187, %165#2, %165#1, %156#2, %156#1, %188, %189, %147#2, %147#1, %138#2, %138#1, %190, %191, %121#2, %121#1, %112#2, %112#1, %192, %193, %194, %129#2, %129#1, %103#2, %103#1, %94#2, %94#1, %195, %196, %77#2, %77#1, %68#2, %68#1, %197, %198, %199, %85#2, %85#1, %59#2, %59#1, %50#2, %50#1, %200, %201, %33#2, %33#1, %24#2, %24#1, %202, %203, %204, %41#2, %41#1, %15#2, %15#1, %6#2, %6#1, %205, %206 : tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, 
tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32> - } -} + %182 = tensor.empty() : tensor<1000x512xf16> + %183 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 0 : i64} ins(%expanded, %arg139 : tensor<1000x1xf16>, tensor<1x512xf16>) outs(%182 : tensor<1000x512xf16>) : tensor<1000x512xf16> + %184 = call @Unknown79(%183) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> + %185 = call @Unknown80(%169) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %186 = call @Unknown80(%160) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %187 = call @Unknown80(%151) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %188 = call @Unknown80(%142) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %189 = call @Unknown84(%125) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> + %190 = call @Unknown85(%116) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %191 = call @Unknown86(%133) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> + %192 = call @Unknown85(%107) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %193 = call @Unknown85(%98) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %194 = call @Unknown89(%81) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> + %195 = call @Unknown90(%72) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %196 = call @Unknown91(%89) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> + %197 = call @Unknown90(%63) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %198 = call @Unknown90(%54) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %199 = call @Unknown94(%37) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> + %200 = call @Unknown95(%28) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %201 = call @Unknown96(%45) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> + %202 = call @Unknown95(%19) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %203 = call @Unknown95(%10) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + return %177#2, %177#1, %180, %181, %184, %165#2, %165#1, %156#2, %156#1, %185, %186, %147#2, %147#1, %138#2, %138#1, %187, %188, %121#2, %121#1, %112#2, %112#1, %189, %190, %191, %129#2, %129#1, %103#2, %103#1, %94#2, %94#1, %192, %193, %77#2, %77#1, %68#2, %68#1, %194, %195, %196, %85#2, %85#1, %59#2, %59#1, %50#2, %50#1, %197, %198, %33#2, %33#1, %24#2, %24#1, %199, %200, %201, %41#2, %41#1, %15#2, %15#1, %6#2, %6#1, %202, %203 : tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, 
tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32>
+  }
+}
\ No newline at end of file
diff --git a/compiler/test/E2E/ResNet18/BW/5_affine_opt.mlir b/compiler/test/E2E/ResNet18/BW/5_affine_opt.mlir
index de9aece87..f7d58879d 100644
--- a/compiler/test/E2E/ResNet18/BW/5_affine_opt.mlir
+++ b/compiler/test/E2E/ResNet18/BW/5_affine_opt.mlir
@@ -2,424 +2,563 @@
 // CHECK-LABEL: func.func @main
-#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
-#map2 = affine_map<(d0, d1) -> (d0, d1)>
-#map3 = affine_map<(d0) -> (d0)>
+#map = affine_map<() -> ()>
 module {
   func.func private @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} {
     %cst = arith.constant 4.900000e+01 : f16
     %cst_0 = arith.constant 0.000000e+00 : f16
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c512 = arith.constant 512 : index
+    %c7 = arith.constant 7 : index
     %alloc = memref.alloc() : memref<1x512x7x7xf16>
-    linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1, %arg0 : memref<1x512x7x7xf16>, memref<1x512xf16>) outs(%alloc : memref<1x512x7x7xf16>) {
-    ^bb0(%in: f16, %in_1: f16, %out: f16):
-      %0 = arith.divf %in_1, %cst : f16
-      %1 = arith.cmpf ogt, %in, %cst_0 : f16
-      %2 = arith.select %1, %0, %cst_0 : f16
-      linalg.yield %2 : f16
+    scf.for %arg2 = %c0 to %c512 step %c1 {
+      scf.for %arg3 = %c0 to %c7 step %c1 {
+        scf.for %arg4 = %c0 to %c7 step %c1 {
+          %subview = memref.subview %arg0[0, %arg2] [1, 1] [1, 1] : memref<1x512xf16> to memref<f16, strided<[], offset: ?>>
+          %subview_1 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref<f16, strided<[], offset: ?>>
+          %subview_2 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref<f16, strided<[], offset: ?>>
+          linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref<f16, strided<[], offset: ?>>, memref<f16, strided<[], offset: ?>>) outs(%subview_1 : memref<f16, strided<[], offset: ?>>) {
+          ^bb0(%in: f16, %in_3: f16, %out: f16):
+            %0 = arith.divf %in, %cst : f16
+            %1 = arith.cmpf ogt, %in_3, %cst_0 : f16
+            %2 = arith.select %1, %0, %cst_0 : f16
+            linalg.yield %2 : f16
+          }
+        }
+      }
     }
     return %alloc : memref<1x512x7x7xf16>
   }
   func.func private @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} {
     %cst = arith.constant 0.000000e+00 : f16
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c512 = arith.constant 512 : index
+    %c7 = arith.constant 7 : index
     %alloc = memref.alloc() : memref<1x512x7x7xf16>
-    linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) {
-    ^bb0(%in: f16, %in_0: f16, %out: f16):
-      %0 = arith.cmpf ogt, %in, %cst : f16
-      %1 = arith.select %0, %in_0, %cst : f16
-      linalg.yield %1 : f16
+    scf.for %arg2 = %c0 to %c512 step %c1 {
+      scf.for %arg3 = %c0 to %c7 step %c1 {
+        scf.for %arg4 = %c0 to %c7 step %c1 {
+          %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref<f16, strided<[], offset: ?>>
+          %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref<f16, strided<[], offset: ?>>
+          %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1]
[1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown12(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg3 = %c0 to %c512 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_2 = memref.subview %arg2[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: f16, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.cmpf ogt, %in_4, %cst : f16 + %2 = arith.select %1, %0, %cst : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : 
memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 + scf.for %arg3 = %c0 to %c256 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + scf.for %arg5 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_2 = memref.subview %arg2[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: f16, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.cmpf ogt, %in_4, %cst : f16 + %2 = arith.select %1, %0, %cst : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown27(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown31(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = 
arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c14 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 + scf.for %arg3 = %c0 to %c128 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + scf.for %arg5 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_2 = memref.subview %arg2[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: f16, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.cmpf ogt, %in_4, %cst : f16 + %2 = arith.select %1, %0, %cst : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select 
%0, %in_0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown46(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown50(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c28 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg3, 
%arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_2 = memref.subview %arg2[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: f16, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.cmpf ogt, %in_4, %cst : f16 + %2 = arith.select %1, %0, %cst : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown65(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown69(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} 
ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %alloc = memref.alloc() : memref<1x64x112x112xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x112x112xf16>, memref<1x64x112x112xf16>) outs(%alloc : memref<1x64x112x112xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c112 step %c1 { + scf.for %arg4 = %c0 to %c112 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x64x112x112xf16> } func.func private @Unknown77(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 
: index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<64x3x7x7xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x3x7x7xf16>) outs(%alloc : memref<64x3x7x7xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<64x3x7x7xf32> } - func.func private @Unknown78(%arg0: memref<1x1000xf16>) -> memref<1x1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: memref<1x1000xf16>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<1x1000xf32> - linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1x1000xf16>) outs(%alloc : memref<1x1000xf32>) { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<1x1000xf32> - } - func.func private @Unknown79(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<1000xf32> - linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel"]} ins(%arg0 : memref<1000xf32>) outs(%alloc : memref<1000xf32>) { - ^bb0(%in: f32, %out: f32): - %0 = arith.truncf %in : f32 to f16 - %1 = arith.extf %0 : f16 to f32 - linalg.yield %1 : f32 - } - return %alloc : memref<1000xf32> - } - func.func private @Unknown80(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + scf.for %arg1 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[0, %arg1] [1, 1] [1, 1] : memref<1x1000xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1] [1, 1] [1, 1] : memref<1x1000xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + %1 = arith.truncf %0 : f32 to f16 + %2 = arith.extf %1 : f16 to f32 + linalg.yield %2 : f32 + } + } + %collapse_shape = memref.collapse_shape %alloc [[0, 1]] : memref<1x1000xf32> into memref<1000xf32> + return %collapse_shape : memref<1000xf32> + } + func.func private @Unknown79(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1000x512xf32> - linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%arg0 : 
memref<1000x512xf16>) outs(%alloc : memref<1000x512xf32>) { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c1000 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<1000x512xf32> } - func.func private @Unknown81(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown82(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown80(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<64x64x3x3xf32> } - func.func private @Unknown83(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown84(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", 
"parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown85(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown84(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x3x3xf16>) outs(%alloc : memref<128x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<128x64x3x3xf32> } - func.func private @Unknown86(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown85(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<128x128x3x3xf32> } - func.func private @Unknown87(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown86(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes 
{__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x1x1xf16>) outs(%alloc : memref<128x64x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<128x64x1x1xf32> } - func.func private @Unknown88(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown89(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown90(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x3x3xf16>) outs(%alloc : memref<256x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf32> 
to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<256x128x3x3xf32> } - func.func private @Unknown91(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown90(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<256x256x3x3xf32> } - func.func private @Unknown92(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown91(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x1x1xf16>) outs(%alloc : memref<256x128x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<256x128x1x1xf32> } - func.func private @Unknown93(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = 
{minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown94(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown95(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown94(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x3x3xf16>) outs(%alloc : memref<512x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<512x256x3x3xf32> } - func.func private @Unknown96(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown95(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf32> to memref> + linalg.generic 
{indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<512x512x3x3xf32> } - func.func private @Unknown97(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown96(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x1x1xf16>) outs(%alloc : memref<512x256x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<512x256x1x1xf32> } - func.func private @Unknown98(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown99(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<512x512x3x3xf32> - } func.func @main(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>, %arg3: memref<64xf32>, %arg4: memref<64xf32>, %arg5: memref<64xf32>, %arg6: memref<64xf32>, %arg7: memref<64xf32>, %arg8: memref<64xf32>, %arg9: memref<64xf32>, %arg10: memref<128xf32>, %arg11: memref<128xf32>, %arg12: memref<128xf32>, %arg13: memref<128xf32>, %arg14: memref<128xf32>, %arg15: memref<128xf32>, %arg16: memref<128xf32>, %arg17: memref<128xf32>, %arg18: memref<128xf32>, %arg19: memref<128xf32>, %arg20: memref<256xf32>, %arg21: memref<256xf32>, %arg22: memref<256xf32>, %arg23: memref<256xf32>, %arg24: memref<256xf32>, %arg25: memref<256xf32>, %arg26: memref<256xf32>, %arg27: memref<256xf32>, %arg28: memref<256xf32>, %arg29: memref<256xf32>, 
%arg30: memref<512xf32>, %arg31: memref<512xf32>, %arg32: memref<512xf32>, %arg33: memref<512xf32>, %arg34: memref<512xf32>, %arg35: memref<512xf32>, %arg36: memref<512xf32>, %arg37: memref<512xf32>, %arg38: memref<512xf32>, %arg39: memref<512xf32>, %arg40: memref<64xf32>, %arg41: memref<64xf32>, %arg42: memref<64xf32>, %arg43: memref<64xf32>, %arg44: memref<64xf32>, %arg45: memref<64xf32>, %arg46: memref<64xf32>, %arg47: memref<64xf32>, %arg48: memref<64xf32>, %arg49: memref<64xf32>, %arg50: memref<128xf32>, %arg51: memref<128xf32>, %arg52: memref<128xf32>, %arg53: memref<128xf32>, %arg54: memref<128xf32>, %arg55: memref<128xf32>, %arg56: memref<128xf32>, %arg57: memref<128xf32>, %arg58: memref<128xf32>, %arg59: memref<128xf32>, %arg60: memref<256xf32>, %arg61: memref<256xf32>, %arg62: memref<256xf32>, %arg63: memref<256xf32>, %arg64: memref<256xf32>, %arg65: memref<256xf32>, %arg66: memref<256xf32>, %arg67: memref<256xf32>, %arg68: memref<256xf32>, %arg69: memref<256xf32>, %arg70: memref<512xf32>, %arg71: memref<512xf32>, %arg72: memref<512xf32>, %arg73: memref<512xf32>, %arg74: memref<512xf32>, %arg75: memref<512xf32>, %arg76: memref<512xf32>, %arg77: memref<512xf32>, %arg78: memref<512xf32>, %arg79: memref<512xf32>, %arg80: memref<64x3x7x7xf16>, %arg81: memref<1x3x224x224xf16>, %arg82: memref<1x64x112x112xf16>, %arg83: memref<1x64x112x112xf16>, %arg84: memref<1x64x56x56xf16>, %arg85: memref<64x64x3x3xf16>, %arg86: memref<1x64x56x56xf16>, %arg87: memref<1x64x56x56xf16>, %arg88: memref<64x64x3x3xf16>, %arg89: memref<1x64x56x56xf16>, %arg90: memref<1x64x56x56xf16>, %arg91: memref<64x64x3x3xf16>, %arg92: memref<1x64x56x56xf16>, %arg93: memref<1x64x56x56xf16>, %arg94: memref<64x64x3x3xf16>, %arg95: memref<1x64x56x56xf16>, %arg96: memref<1x64x56x56xf16>, %arg97: memref<128x64x3x3xf16>, %arg98: memref<1x128x28x28xf16>, %arg99: memref<1x128x28x28xf16>, %arg100: memref<128x128x3x3xf16>, %arg101: memref<1x128x28x28xf16>, %arg102: memref<128x64x1x1xf16>, %arg103: memref<1x128x28x28xf16>, %arg104: memref<1x128x28x28xf16>, %arg105: memref<128x128x3x3xf16>, %arg106: memref<1x128x28x28xf16>, %arg107: memref<1x128x28x28xf16>, %arg108: memref<128x128x3x3xf16>, %arg109: memref<1x128x28x28xf16>, %arg110: memref<1x128x28x28xf16>, %arg111: memref<256x128x3x3xf16>, %arg112: memref<1x256x14x14xf16>, %arg113: memref<1x256x14x14xf16>, %arg114: memref<256x256x3x3xf16>, %arg115: memref<1x256x14x14xf16>, %arg116: memref<256x128x1x1xf16>, %arg117: memref<1x256x14x14xf16>, %arg118: memref<1x256x14x14xf16>, %arg119: memref<256x256x3x3xf16>, %arg120: memref<1x256x14x14xf16>, %arg121: memref<1x256x14x14xf16>, %arg122: memref<256x256x3x3xf16>, %arg123: memref<1x256x14x14xf16>, %arg124: memref<1x256x14x14xf16>, %arg125: memref<512x256x3x3xf16>, %arg126: memref<1x512x7x7xf16>, %arg127: memref<1x512x7x7xf16>, %arg128: memref<512x512x3x3xf16>, %arg129: memref<1x512x7x7xf16>, %arg130: memref<512x256x1x1xf16>, %arg131: memref<1x512x7x7xf16>, %arg132: memref<1x512x7x7xf16>, %arg133: memref<512x512x3x3xf16>, %arg134: memref<1x512x7x7xf16>, %arg135: memref<1x512x7x7xf16>, %arg136: memref<512x512x3x3xf16>, %arg137: memref<1x512x7x7xf16>, %arg138: memref<1x512x7x7xf16>, %arg139: memref<1x512xf16>, %arg140: memref<512x1000xf16>, %arg141: memref<1x1000xf16>) -> (memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, 
memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32>) attributes {__placeholder__byre.entry_point} { %alloc = memref.alloc() : memref<1x512xf16> byre.compute @MatmulOp_f16f16_f16(%arg141, %arg140, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x1000xf16>, memref<512x1000xf16>, memref<1x512xf16> @@ -450,7 +589,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_10, %arg128, %alloc_13) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_14 = memref.alloc() : memref<512x512x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg127, %alloc_10, %alloc_14) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16> - %3 = call @Unknown12(%arg127, %alloc_13) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %3 = call @Unknown4(%arg127, %alloc_13) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> %alloc_15 = memref.alloc() : memref<1x512x7x7xf16> %alloc_16 = memref.alloc() : memref<512xf32> %alloc_17 = memref.alloc() : memref<512xf32> @@ -485,7 +624,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_30, %arg119, %alloc_33) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_34 = memref.alloc() : memref<256x256x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %alloc_30, %alloc_34) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16> - %6 = call @Unknown27(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) 
-> memref<1x256x14x14xf16> + %6 = call @Unknown19(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %alloc_35 = memref.alloc() : memref<1x256x14x14xf16> %alloc_36 = memref.alloc() : memref<256xf32> %alloc_37 = memref.alloc() : memref<256xf32> @@ -494,7 +633,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_35, %arg114, %alloc_38) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_39 = memref.alloc() : memref<256x256x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %alloc_35, %alloc_39) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16> - %7 = call @Unknown31(%arg113, %alloc_38) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %7 = call @Unknown23(%arg113, %alloc_38) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16> %alloc_41 = memref.alloc() : memref<256xf32> %alloc_42 = memref.alloc() : memref<256xf32> @@ -529,7 +668,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_55, %arg105, %alloc_58) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_59 = memref.alloc() : memref<128x128x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg104, %alloc_55, %alloc_59) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16> - %10 = call @Unknown46(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %10 = call @Unknown38(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %alloc_60 = memref.alloc() : memref<1x128x28x28xf16> %alloc_61 = memref.alloc() : memref<128xf32> %alloc_62 = memref.alloc() : memref<128xf32> @@ -538,7 +677,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_60, %arg100, %alloc_63) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_64 = memref.alloc() : memref<128x128x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg99, %alloc_60, %alloc_64) {batch_group_count = 1 : 
i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16> - %11 = call @Unknown50(%arg99, %alloc_63) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %11 = call @Unknown42(%arg99, %alloc_63) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %alloc_65 = memref.alloc() : memref<1x128x28x28xf16> %alloc_66 = memref.alloc() : memref<128xf32> %alloc_67 = memref.alloc() : memref<128xf32> @@ -573,7 +712,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_80, %arg91, %alloc_83) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_84 = memref.alloc() : memref<64x64x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg90, %alloc_80, %alloc_84) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16> - %14 = call @Unknown65(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %14 = call @Unknown57(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %alloc_85 = memref.alloc() : memref<1x64x56x56xf16> %alloc_86 = memref.alloc() : memref<64xf32> %alloc_87 = memref.alloc() : memref<64xf32> @@ -582,7 +721,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_85, %arg88, %alloc_88) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_89 = memref.alloc() : memref<64x64x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg87, %alloc_85, %alloc_89) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16> - %15 = call @Unknown69(%arg87, %alloc_88) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %15 = call @Unknown61(%arg87, %alloc_88) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %alloc_90 = memref.alloc() : memref<1x64x56x56xf16> %alloc_91 = memref.alloc() : memref<64xf32> %alloc_92 = memref.alloc() : memref<64xf32> @@ -602,34 +741,31 @@ module { %alloc_99 = memref.alloc() : memref<64x3x7x7xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg81, %alloc_96, %alloc_99) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = 
"NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<64x3x7x7xf16> %18 = call @Unknown77(%alloc_99) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> - %19 = call @Unknown78(%arg141) : (memref<1x1000xf16>) -> memref<1x1000xf32> - %alloc_100 = memref.alloc() : memref<1000xf32> - byre.compute @ReduceSumOp_f32_f32(%19, %alloc_100) {dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf32>, memref<1000xf32> - %20 = call @Unknown79(%alloc_100) : (memref<1000xf32>) -> memref<1000xf32> + %19 = call @Unknown78(%arg141) : (memref<1x1000xf16>) -> memref<1000xf32> %collapse_shape = memref.collapse_shape %arg141 [[0, 1]] : memref<1x1000xf16> into memref<1000xf16> %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<1000xf16> into memref<1000x1xf16> - %alloc_101 = memref.alloc() : memref<1000x512xf16> - byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_101) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16>, memref<1x512xf16>, memref<1000x512xf16> - %21 = call @Unknown80(%alloc_101) : (memref<1000x512xf16>) -> memref<1000x512xf32> - %22 = call @Unknown81(%alloc_94) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %23 = call @Unknown82(%alloc_89) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %24 = call @Unknown83(%alloc_84) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %25 = call @Unknown84(%alloc_79) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %26 = call @Unknown85(%alloc_69) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> - %27 = call @Unknown86(%alloc_64) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %28 = call @Unknown87(%alloc_74) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> - %29 = call @Unknown88(%alloc_59) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %30 = call @Unknown89(%alloc_54) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %31 = call @Unknown90(%alloc_44) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> - %32 = call @Unknown91(%alloc_39) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %33 = call @Unknown92(%alloc_49) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> - %34 = call @Unknown93(%alloc_34) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %35 = call @Unknown94(%alloc_29) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %36 = call @Unknown95(%alloc_19) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> - %37 = call @Unknown96(%alloc_14) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %38 = call @Unknown97(%alloc_24) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> - %39 = call @Unknown98(%alloc_9) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %40 = call @Unknown99(%alloc_4) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - return %alloc_98, %alloc_97, %18, %20, %21, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %22, %23, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %24, %25, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %26, %27, %28, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %29, %30, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %31, %32, %33, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %34, %35, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %36, %37, %38, %alloc_22, %alloc_21, 
%alloc_7, %alloc_6, %alloc_2, %alloc_1, %39, %40 : memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32> + %alloc_100 = memref.alloc() : memref<1000x512xf16> + byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_100) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16>, memref<1x512xf16>, memref<1000x512xf16> + %20 = call @Unknown79(%alloc_100) : (memref<1000x512xf16>) -> memref<1000x512xf32> + %21 = call @Unknown80(%alloc_94) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %22 = call @Unknown80(%alloc_89) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %23 = call @Unknown80(%alloc_84) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %24 = call @Unknown80(%alloc_79) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %25 = call @Unknown84(%alloc_69) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> + %26 = call @Unknown85(%alloc_64) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %27 = call @Unknown86(%alloc_74) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> + %28 = call @Unknown85(%alloc_59) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %29 = call @Unknown85(%alloc_54) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %30 = call @Unknown89(%alloc_44) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> + %31 = call @Unknown90(%alloc_39) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %32 = call @Unknown91(%alloc_49) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> + %33 = call @Unknown90(%alloc_34) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %34 = call @Unknown90(%alloc_29) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %35 = call @Unknown94(%alloc_19) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> + %36 = call @Unknown95(%alloc_14) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %37 = call @Unknown96(%alloc_24) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> + %38 = call @Unknown95(%alloc_9) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %39 = call @Unknown95(%alloc_4) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + return %alloc_98, %alloc_97, %18, %19, %20, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %21, %22, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %23, %24, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %25, %26, %27, %alloc_72, 
%alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %28, %29, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %30, %31, %32, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %33, %34, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %35, %36, %37, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %38, %39 : memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/5_alternative_scf_opt.mlir b/compiler/test/E2E/ResNet18/BW/5_alternative_scf_opt.mlir index 5206f661d..ef34370db 100644 --- a/compiler/test/E2E/ResNet18/BW/5_alternative_scf_opt.mlir +++ b/compiler/test/E2E/ResNet18/BW/5_alternative_scf_opt.mlir @@ -2,424 +2,563 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> -#map2 = affine_map<(d0, d1) -> (d0, d1)> -#map3 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> module { func.func private @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 4.900000e+01 : f16 %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1, %arg0 : memref<1x512x7x7xf16>, memref<1x512xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_1: f16, %out: f16): - %0 = arith.divf %in_1, %cst : f16 - %1 = arith.cmpf ogt, %in, %cst_0 : f16 - %2 = arith.select %1, %0, %cst_0 : f16 - linalg.yield %2 : f16 + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[0, %arg2] [1, 1] [1, 1] : memref<1x512xf16> to memref> + %subview_1 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_2 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref>, memref>) outs(%subview_1 : memref>) { + 
^bb0(%in: f16, %in_3: f16, %out: f16): + %0 = arith.divf %in, %cst : f16 + %1 = arith.cmpf ogt, %in_3, %cst_0 : f16 + %2 = arith.select %1, %0, %cst_0 : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown12(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg3 = %c0 to %c512 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] 
[1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_2 = memref.subview %arg2[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: f16, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.cmpf ogt, %in_4, %cst : f16 + %2 = arith.select %1, %0, %cst : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 + scf.for %arg3 = %c0 to %c256 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + scf.for %arg5 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_2 = memref.subview %arg2[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: f16, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.cmpf ogt, %in_4, %cst : f16 + %2 = arith.select %1, %0, %cst : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : 
memref<1x256x14x14xf16> - } - func.func private @Unknown27(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown31(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c14 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 + scf.for %arg3 = %c0 to %c128 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + scf.for %arg5 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : 
memref<1x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_2 = memref.subview %arg2[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: f16, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.cmpf ogt, %in_4, %cst : f16 + %2 = arith.select %1, %0, %cst : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown46(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown50(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c28 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, 
%subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_2 = memref.subview %arg2[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: f16, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.cmpf ogt, %in_4, %cst : f16 + %2 = arith.select %1, %0, %cst : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown65(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, 
%in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown69(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %alloc = memref.alloc() : 
memref<1x64x112x112xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x112x112xf16>, memref<1x64x112x112xf16>) outs(%alloc : memref<1x64x112x112xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c112 step %c1 { + scf.for %arg4 = %c0 to %c112 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x64x112x112xf16> } func.func private @Unknown77(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<64x3x7x7xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x3x7x7xf16>) outs(%alloc : memref<64x3x7x7xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<64x3x7x7xf32> } - func.func private @Unknown78(%arg0: memref<1x1000xf16>) -> memref<1x1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: memref<1x1000xf16>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<1x1000xf32> - linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1x1000xf16>) outs(%alloc : memref<1x1000xf32>) { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<1x1000xf32> - } - func.func private @Unknown79(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<1000xf32> - linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel"]} 
ins(%arg0 : memref<1000xf32>) outs(%alloc : memref<1000xf32>) { - ^bb0(%in: f32, %out: f32): - %0 = arith.truncf %in : f32 to f16 - %1 = arith.extf %0 : f16 to f32 - linalg.yield %1 : f32 - } - return %alloc : memref<1000xf32> - } - func.func private @Unknown80(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + scf.for %arg1 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[0, %arg1] [1, 1] [1, 1] : memref<1x1000xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1] [1, 1] [1, 1] : memref<1x1000xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + %1 = arith.truncf %0 : f32 to f16 + %2 = arith.extf %1 : f16 to f32 + linalg.yield %2 : f32 + } + } + %collapse_shape = memref.collapse_shape %alloc [[0, 1]] : memref<1x1000xf32> into memref<1000xf32> + return %collapse_shape : memref<1000xf32> + } + func.func private @Unknown79(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1000x512xf32> - linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1000x512xf16>) outs(%alloc : memref<1000x512xf32>) { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c1000 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<1000x512xf32> } - func.func private @Unknown81(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown82(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown80(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 
to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<64x64x3x3xf32> } - func.func private @Unknown83(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown84(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown85(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown84(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x3x3xf16>) outs(%alloc : memref<128x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<128x64x3x3xf32> } - func.func private @Unknown86(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown85(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = 
arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<128x128x3x3xf32> } - func.func private @Unknown87(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown86(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x1x1xf16>) outs(%alloc : memref<128x64x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<128x64x1x1xf32> } - func.func private @Unknown88(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown89(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs 
= {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown90(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x3x3xf16>) outs(%alloc : memref<256x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<256x128x3x3xf32> } - func.func private @Unknown91(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown90(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<256x256x3x3xf32> } - func.func private @Unknown92(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown91(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index 
+ %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x1x1xf16>) outs(%alloc : memref<256x128x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<256x128x1x1xf32> } - func.func private @Unknown93(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown94(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown95(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown94(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x3x3xf16>) outs(%alloc : memref<512x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : 
memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<512x256x3x3xf32> } - func.func private @Unknown96(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown95(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<512x512x3x3xf32> } - func.func private @Unknown97(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown96(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x1x1xf16>) outs(%alloc : memref<512x256x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<512x256x1x1xf32> } - func.func private @Unknown98(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = 
arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown99(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<512x512x3x3xf32> - } func.func @main(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>, %arg3: memref<64xf32>, %arg4: memref<64xf32>, %arg5: memref<64xf32>, %arg6: memref<64xf32>, %arg7: memref<64xf32>, %arg8: memref<64xf32>, %arg9: memref<64xf32>, %arg10: memref<128xf32>, %arg11: memref<128xf32>, %arg12: memref<128xf32>, %arg13: memref<128xf32>, %arg14: memref<128xf32>, %arg15: memref<128xf32>, %arg16: memref<128xf32>, %arg17: memref<128xf32>, %arg18: memref<128xf32>, %arg19: memref<128xf32>, %arg20: memref<256xf32>, %arg21: memref<256xf32>, %arg22: memref<256xf32>, %arg23: memref<256xf32>, %arg24: memref<256xf32>, %arg25: memref<256xf32>, %arg26: memref<256xf32>, %arg27: memref<256xf32>, %arg28: memref<256xf32>, %arg29: memref<256xf32>, %arg30: memref<512xf32>, %arg31: memref<512xf32>, %arg32: memref<512xf32>, %arg33: memref<512xf32>, %arg34: memref<512xf32>, %arg35: memref<512xf32>, %arg36: memref<512xf32>, %arg37: memref<512xf32>, %arg38: memref<512xf32>, %arg39: memref<512xf32>, %arg40: memref<64xf32>, %arg41: memref<64xf32>, %arg42: memref<64xf32>, %arg43: memref<64xf32>, %arg44: memref<64xf32>, %arg45: memref<64xf32>, %arg46: memref<64xf32>, %arg47: memref<64xf32>, %arg48: memref<64xf32>, %arg49: memref<64xf32>, %arg50: memref<128xf32>, %arg51: memref<128xf32>, %arg52: memref<128xf32>, %arg53: memref<128xf32>, %arg54: memref<128xf32>, %arg55: memref<128xf32>, %arg56: memref<128xf32>, %arg57: memref<128xf32>, %arg58: memref<128xf32>, %arg59: memref<128xf32>, %arg60: memref<256xf32>, %arg61: memref<256xf32>, %arg62: memref<256xf32>, %arg63: memref<256xf32>, %arg64: memref<256xf32>, %arg65: memref<256xf32>, %arg66: memref<256xf32>, %arg67: memref<256xf32>, %arg68: memref<256xf32>, %arg69: memref<256xf32>, %arg70: memref<512xf32>, %arg71: memref<512xf32>, %arg72: memref<512xf32>, %arg73: memref<512xf32>, %arg74: memref<512xf32>, %arg75: memref<512xf32>, %arg76: memref<512xf32>, %arg77: memref<512xf32>, %arg78: memref<512xf32>, %arg79: memref<512xf32>, %arg80: memref<64x3x7x7xf16>, %arg81: memref<1x3x224x224xf16>, %arg82: memref<1x64x112x112xf16>, %arg83: memref<1x64x112x112xf16>, %arg84: memref<1x64x56x56xf16>, %arg85: memref<64x64x3x3xf16>, %arg86: memref<1x64x56x56xf16>, %arg87: memref<1x64x56x56xf16>, %arg88: memref<64x64x3x3xf16>, %arg89: memref<1x64x56x56xf16>, %arg90: memref<1x64x56x56xf16>, %arg91: memref<64x64x3x3xf16>, %arg92: memref<1x64x56x56xf16>, %arg93: memref<1x64x56x56xf16>, %arg94: memref<64x64x3x3xf16>, %arg95: memref<1x64x56x56xf16>, %arg96: memref<1x64x56x56xf16>, %arg97: memref<128x64x3x3xf16>, %arg98: memref<1x128x28x28xf16>, %arg99: memref<1x128x28x28xf16>, %arg100: memref<128x128x3x3xf16>, %arg101: memref<1x128x28x28xf16>, %arg102: memref<128x64x1x1xf16>, %arg103: memref<1x128x28x28xf16>, %arg104: memref<1x128x28x28xf16>, %arg105: memref<128x128x3x3xf16>, %arg106: memref<1x128x28x28xf16>, 
%arg107: memref<1x128x28x28xf16>, %arg108: memref<128x128x3x3xf16>, %arg109: memref<1x128x28x28xf16>, %arg110: memref<1x128x28x28xf16>, %arg111: memref<256x128x3x3xf16>, %arg112: memref<1x256x14x14xf16>, %arg113: memref<1x256x14x14xf16>, %arg114: memref<256x256x3x3xf16>, %arg115: memref<1x256x14x14xf16>, %arg116: memref<256x128x1x1xf16>, %arg117: memref<1x256x14x14xf16>, %arg118: memref<1x256x14x14xf16>, %arg119: memref<256x256x3x3xf16>, %arg120: memref<1x256x14x14xf16>, %arg121: memref<1x256x14x14xf16>, %arg122: memref<256x256x3x3xf16>, %arg123: memref<1x256x14x14xf16>, %arg124: memref<1x256x14x14xf16>, %arg125: memref<512x256x3x3xf16>, %arg126: memref<1x512x7x7xf16>, %arg127: memref<1x512x7x7xf16>, %arg128: memref<512x512x3x3xf16>, %arg129: memref<1x512x7x7xf16>, %arg130: memref<512x256x1x1xf16>, %arg131: memref<1x512x7x7xf16>, %arg132: memref<1x512x7x7xf16>, %arg133: memref<512x512x3x3xf16>, %arg134: memref<1x512x7x7xf16>, %arg135: memref<1x512x7x7xf16>, %arg136: memref<512x512x3x3xf16>, %arg137: memref<1x512x7x7xf16>, %arg138: memref<1x512x7x7xf16>, %arg139: memref<1x512xf16>, %arg140: memref<512x1000xf16>, %arg141: memref<1x1000xf16>) -> (memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32>) attributes {__placeholder__byre.entry_point} { %alloc = memref.alloc() : memref<1x512xf16> byre.compute @MatmulOp_f16f16_f16(%arg141, %arg140, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x1000xf16>, memref<512x1000xf16>, memref<1x512xf16> @@ -450,7 +589,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_10, %arg128, %alloc_13) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_14 = memref.alloc() : memref<512x512x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg127, %alloc_10, %alloc_14) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, 
memref<1x512x7x7xf16>, memref<512x512x3x3xf16> - %3 = call @Unknown12(%arg127, %alloc_13) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %3 = call @Unknown4(%arg127, %alloc_13) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> %alloc_15 = memref.alloc() : memref<1x512x7x7xf16> %alloc_16 = memref.alloc() : memref<512xf32> %alloc_17 = memref.alloc() : memref<512xf32> @@ -485,7 +624,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_30, %arg119, %alloc_33) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_34 = memref.alloc() : memref<256x256x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %alloc_30, %alloc_34) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16> - %6 = call @Unknown27(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %6 = call @Unknown19(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %alloc_35 = memref.alloc() : memref<1x256x14x14xf16> %alloc_36 = memref.alloc() : memref<256xf32> %alloc_37 = memref.alloc() : memref<256xf32> @@ -494,7 +633,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_35, %arg114, %alloc_38) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_39 = memref.alloc() : memref<256x256x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %alloc_35, %alloc_39) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16> - %7 = call @Unknown31(%arg113, %alloc_38) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %7 = call @Unknown23(%arg113, %alloc_38) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16> %alloc_41 = memref.alloc() : memref<256xf32> %alloc_42 = memref.alloc() : memref<256xf32> @@ -529,7 +668,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_55, %arg105, %alloc_58) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_59 = memref.alloc() : memref<128x128x3x3xf16> 
byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg104, %alloc_55, %alloc_59) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16> - %10 = call @Unknown46(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %10 = call @Unknown38(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %alloc_60 = memref.alloc() : memref<1x128x28x28xf16> %alloc_61 = memref.alloc() : memref<128xf32> %alloc_62 = memref.alloc() : memref<128xf32> @@ -538,7 +677,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_60, %arg100, %alloc_63) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_64 = memref.alloc() : memref<128x128x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg99, %alloc_60, %alloc_64) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16> - %11 = call @Unknown50(%arg99, %alloc_63) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %11 = call @Unknown42(%arg99, %alloc_63) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %alloc_65 = memref.alloc() : memref<1x128x28x28xf16> %alloc_66 = memref.alloc() : memref<128xf32> %alloc_67 = memref.alloc() : memref<128xf32> @@ -573,7 +712,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_80, %arg91, %alloc_83) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_84 = memref.alloc() : memref<64x64x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg90, %alloc_80, %alloc_84) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16> - %14 = call @Unknown65(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %14 = call @Unknown57(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %alloc_85 = memref.alloc() : memref<1x64x56x56xf16> %alloc_86 = memref.alloc() : memref<64xf32> %alloc_87 = memref.alloc() : memref<64xf32> @@ -582,7 +721,7 @@ module { byre.compute 
@ConvBackwardDataOp_f16f16_f16(%alloc_85, %arg88, %alloc_88) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_89 = memref.alloc() : memref<64x64x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg87, %alloc_85, %alloc_89) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16> - %15 = call @Unknown69(%arg87, %alloc_88) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %15 = call @Unknown61(%arg87, %alloc_88) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %alloc_90 = memref.alloc() : memref<1x64x56x56xf16> %alloc_91 = memref.alloc() : memref<64xf32> %alloc_92 = memref.alloc() : memref<64xf32> @@ -602,34 +741,31 @@ module { %alloc_99 = memref.alloc() : memref<64x3x7x7xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg81, %alloc_96, %alloc_99) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<64x3x7x7xf16> %18 = call @Unknown77(%alloc_99) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> - %19 = call @Unknown78(%arg141) : (memref<1x1000xf16>) -> memref<1x1000xf32> - %alloc_100 = memref.alloc() : memref<1000xf32> - byre.compute @ReduceSumOp_f32_f32(%19, %alloc_100) {dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf32>, memref<1000xf32> - %20 = call @Unknown79(%alloc_100) : (memref<1000xf32>) -> memref<1000xf32> + %19 = call @Unknown78(%arg141) : (memref<1x1000xf16>) -> memref<1000xf32> %collapse_shape = memref.collapse_shape %arg141 [[0, 1]] : memref<1x1000xf16> into memref<1000xf16> %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<1000xf16> into memref<1000x1xf16> - %alloc_101 = memref.alloc() : memref<1000x512xf16> - byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_101) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16>, memref<1x512xf16>, memref<1000x512xf16> - %21 = call @Unknown80(%alloc_101) : (memref<1000x512xf16>) -> memref<1000x512xf32> - %22 = call @Unknown81(%alloc_94) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %23 = call @Unknown82(%alloc_89) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %24 = call @Unknown83(%alloc_84) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %25 = call @Unknown84(%alloc_79) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %26 = call @Unknown85(%alloc_69) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> - %27 = call @Unknown86(%alloc_64) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %28 = call @Unknown87(%alloc_74) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> - %29 = call @Unknown88(%alloc_59) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %30 = call 
@Unknown89(%alloc_54) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %31 = call @Unknown90(%alloc_44) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> - %32 = call @Unknown91(%alloc_39) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %33 = call @Unknown92(%alloc_49) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> - %34 = call @Unknown93(%alloc_34) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %35 = call @Unknown94(%alloc_29) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %36 = call @Unknown95(%alloc_19) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> - %37 = call @Unknown96(%alloc_14) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %38 = call @Unknown97(%alloc_24) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> - %39 = call @Unknown98(%alloc_9) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %40 = call @Unknown99(%alloc_4) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - return %alloc_98, %alloc_97, %18, %20, %21, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %22, %23, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %24, %25, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %26, %27, %28, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %29, %30, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %31, %32, %33, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %34, %35, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %36, %37, %38, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %39, %40 : memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32> + %alloc_100 = memref.alloc() : memref<1000x512xf16> + byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_100) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16>, memref<1x512xf16>, memref<1000x512xf16> + %20 = call @Unknown79(%alloc_100) : (memref<1000x512xf16>) -> memref<1000x512xf32> + %21 = call @Unknown80(%alloc_94) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %22 = call @Unknown80(%alloc_89) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %23 = call @Unknown80(%alloc_84) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %24 = call @Unknown80(%alloc_79) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %25 = call @Unknown84(%alloc_69) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> + 
%26 = call @Unknown85(%alloc_64) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %27 = call @Unknown86(%alloc_74) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> + %28 = call @Unknown85(%alloc_59) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %29 = call @Unknown85(%alloc_54) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %30 = call @Unknown89(%alloc_44) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> + %31 = call @Unknown90(%alloc_39) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %32 = call @Unknown91(%alloc_49) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> + %33 = call @Unknown90(%alloc_34) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %34 = call @Unknown90(%alloc_29) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %35 = call @Unknown94(%alloc_19) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> + %36 = call @Unknown95(%alloc_14) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %37 = call @Unknown96(%alloc_24) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> + %38 = call @Unknown95(%alloc_9) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %39 = call @Unknown95(%alloc_4) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + return %alloc_98, %alloc_97, %18, %19, %20, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %21, %22, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %23, %24, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %25, %26, %27, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %28, %29, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %30, %31, %32, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %33, %34, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %35, %36, %37, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %38, %39 : memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/6_gpu_opt.mlir b/compiler/test/E2E/ResNet18/BW/6_gpu_opt.mlir index afd93d077..83a5ea971 100644 --- a/compiler/test/E2E/ResNet18/BW/6_gpu_opt.mlir +++ b/compiler/test/E2E/ResNet18/BW/6_gpu_opt.mlir @@ -4,1571 +4,468 @@ module { func.func private @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 
: index %cst = arith.constant 0.000000e+00 : f16 %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index %c25088 = arith.constant 25088 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> scf.for %arg2 = %c0 to %c25088 step %c1 { %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %21 = memref.load %arg0[%c0, %19] : memref<1x512xf16> - %22 = arith.divf %21, %cst_0 : f16 - %23 = arith.cmpf ogt, %20, %cst : f16 - %24 = arith.select %23, %22, %cst : f16 - memref.store %24, %alloc[%c0, %19, %13, %3] : memref<1x512x7x7xf16> + %1 = arith.divsi %arg2, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = memref.load %arg0[%c0, %3] : memref<1x512xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %6 = arith.divf %4, %cst_0 : f16 + %7 = arith.cmpf ogt, %5, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + memref.store %8, %alloc[%c0, %3, %2, %0] : memref<1x512x7x7xf16> } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c7 = arith.constant 7 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c25088 = arith.constant 25088 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> scf.for %arg2 = %c0 to %c25088 step %c1 { %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x512x7x7xf16> + %1 = arith.divsi %arg2, %c7 : 
index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %6 = arith.cmpf ogt, %4, %cst : f16 + %7 = arith.select %6, %5, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : memref<1x512x7x7xf16> } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c7 = arith.constant 7 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c25088 = arith.constant 25088 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> scf.for %arg3 = %c0 to %c25088 step %c1 { %0 = arith.remsi %arg3, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg2[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %21 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %22 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %23 = arith.addf %21, %22 : f16 - %24 = arith.cmpf ogt, %20, %cst : f16 - %25 = arith.select %24, %23, %cst : f16 - memref.store %25, %alloc[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown12(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x512x7x7xf16> - scf.for %arg2 = %c0 to %c25088 step %c1 { - %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %21 = memref.load %arg1[%c0, 
%19, %13, %3] : memref<1x512x7x7xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x512x7x7xf16> + %1 = arith.divsi %arg3, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %6 = memref.load %arg2[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %7 = arith.addf %4, %5 : f16 + %8 = arith.cmpf ogt, %6, %cst : f16 + %9 = arith.select %8, %7, %cst : f16 + memref.store %9, %alloc[%c0, %3, %2, %0] : memref<1x512x7x7xf16> } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c14 = arith.constant 14 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c50176 = arith.constant 50176 : index - %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> scf.for %arg3 = %c0 to %c50176 step %c1 { %0 = arith.remsi %arg3, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg2[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %21 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %22 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %23 = arith.addf %21, %22 : f16 - %24 = arith.cmpf ogt, %20, %cst : f16 - %25 = arith.select %24, %23, %cst : f16 - memref.store %25, %alloc[%c0, %19, %13, %3] : memref<1x256x14x14xf16> + %1 = arith.divsi %arg3, %c14 : index + %2 = arith.remsi %1, %c14 : index + %3 = arith.divsi %1, %c14 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x256x14x14xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x256x14x14xf16> + %6 = memref.load %arg2[%c0, %3, %2, %0] : memref<1x256x14x14xf16> + %7 = arith.addf %4, %5 : f16 + %8 = arith.cmpf ogt, %6, %cst : f16 + %9 = arith.select %8, %7, %cst : f16 + memref.store %9, %alloc[%c0, %3, %2, %0] : memref<1x256x14x14xf16> } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c1 = arith.constant 1 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - scf.for %arg2 = %c0 to %c50176 step %c1 { - %0 = arith.remsi %arg2, %c14 : 
index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown27(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - scf.for %arg3 = %c0 to %c50176 step %c1 { - %0 = arith.remsi %arg3, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg2[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %21 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %22 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %23 = arith.addf %21, %22 : f16 - %24 = arith.cmpf ogt, %20, %cst : f16 - %25 = arith.select %24, %23, %cst : f16 - memref.store %25, %alloc[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown31(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c50176 = arith.constant 50176 : index - %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> scf.for %arg2 = %c0 to %c50176 step %c1 { %0 = arith.remsi %arg2, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = 
arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x256x14x14xf16> + %1 = arith.divsi %arg2, %c14 : index + %2 = arith.remsi %1, %c14 : index + %3 = arith.divsi %1, %c14 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x256x14x14xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x256x14x14xf16> + %6 = arith.cmpf ogt, %4, %cst : f16 + %7 = arith.select %6, %5, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : memref<1x256x14x14xf16> } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c28 = arith.constant 28 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> scf.for %arg3 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg3, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg2[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %21 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %22 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %23 = arith.addf %21, %22 : f16 - %24 = arith.cmpf ogt, %20, %cst : f16 - %25 = arith.select %24, %23, %cst : f16 - memref.store %25, %alloc[%c0, %19, %13, %3] : memref<1x128x28x28xf16> + %1 = arith.divsi %arg3, %c28 : index + %2 = arith.remsi %1, %c28 : index + %3 = arith.divsi %1, %c28 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x128x28x28xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x128x28x28xf16> + %6 = memref.load %arg2[%c0, %3, %2, %0] : memref<1x128x28x28xf16> + %7 = arith.addf %4, %5 : f16 + %8 = arith.cmpf ogt, %6, %cst : f16 + %9 = arith.select %8, %7, %cst : f16 + memref.store %9, 
%alloc[%c0, %3, %2, %0] : memref<1x128x28x28xf16> } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - scf.for %arg2 = %c0 to %c100352 step %c1 { - %0 = arith.remsi %arg2, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown46(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - scf.for %arg3 = %c0 to %c100352 step %c1 { - %0 = arith.remsi %arg3, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg2[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %21 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %22 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %23 = arith.addf %21, %22 : f16 - %24 = arith.cmpf ogt, %20, %cst : f16 - %25 = arith.select %24, %23, %cst : f16 - memref.store %25, %alloc[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown50(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> 
memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> scf.for %arg2 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg2, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x128x28x28xf16> + %1 = arith.divsi %arg2, %c28 : index + %2 = arith.remsi %1, %c28 : index + %3 = arith.divsi %1, %c28 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x128x28x28xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x128x28x28xf16> + %6 = arith.cmpf ogt, %4, %cst : f16 + %7 = arith.select %6, %5, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : memref<1x128x28x28xf16> } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c56 = arith.constant 56 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> scf.for %arg3 = %c0 to %c200704 step %c1 { %0 = arith.remsi %arg3, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg2[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %22 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %23 = arith.addf 
%21, %22 : f16 - %24 = arith.cmpf ogt, %20, %cst : f16 - %25 = arith.select %24, %23, %cst : f16 - memref.store %25, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> + %1 = arith.divsi %arg3, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %6 = memref.load %arg2[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %7 = arith.addf %4, %5 : f16 + %8 = arith.cmpf ogt, %6, %cst : f16 + %9 = arith.select %8, %7, %cst : f16 + memref.store %9, %alloc[%c0, %3, %2, %0] : memref<1x64x56x56xf16> } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - scf.for %arg2 = %c0 to %c200704 step %c1 { - %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown65(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - scf.for %arg3 = %c0 to %c200704 step %c1 { - %0 = arith.remsi %arg3, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = 
arith.select %14, %18, %17 : index - %20 = memref.load %arg2[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %22 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %23 = arith.addf %21, %22 : f16 - %24 = arith.cmpf ogt, %20, %cst : f16 - %25 = arith.select %24, %23, %cst : f16 - memref.store %25, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown69(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> scf.for %arg2 = %c0 to %c200704 step %c1 { %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> + %1 = arith.divsi %arg2, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %6 = arith.cmpf ogt, %4, %cst : f16 + %7 = arith.select %6, %5, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : memref<1x64x56x56xf16> } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> scf.for %arg2 = %c0 to %c200704 step %c1 { %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = 
arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %22 = arith.addf %20, %21 : f16 - memref.store %22, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> + %1 = arith.divsi %arg2, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %6 = arith.addf %4, %5 : f16 + memref.store %6, %alloc[%c0, %3, %2, %0] : memref<1x64x56x56xf16> } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c112 = arith.constant 112 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x64x112x112xf16> scf.for %arg2 = %c0 to %c802816 step %c1 { %0 = arith.remsi %arg2, %c112 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c112 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c112 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c112 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c112 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c112 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x112x112xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x64x112x112xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x64x112x112xf16> + %1 = arith.divsi %arg2, %c112 : index + %2 = arith.remsi %1, %c112 : index + %3 = arith.divsi %1, %c112 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x64x112x112xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x64x112x112xf16> + %6 = arith.cmpf ogt, %4, %cst : f16 + %7 = arith.select %6, %5, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : memref<1x64x112x112xf16> } return %alloc : memref<1x64x112x112xf16> } func.func private @Unknown77(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c1 = arith.constant 1 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c9408 = arith.constant 9408 : index %alloc = memref.alloc() : memref<64x3x7x7xf32> scf.for %arg1 = %c0 to %c9408 step %c1 { %0 = arith.remsi %arg1, %c7 : index - %1 = 
arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c3 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c3 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c3 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x3x7x7xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x3x7x7xf32> + %1 = arith.divsi %arg1, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c3 : index + %5 = arith.divsi %3, %c3 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<64x3x7x7xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<64x3x7x7xf32> } return %alloc : memref<64x3x7x7xf32> } - func.func private @Unknown78(%arg0: memref<1x1000xf16>) -> memref<1x1000xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index + func.func private @Unknown78(%arg0: memref<1x1000xf16>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { %c1000 = arith.constant 1000 : index %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %alloc = memref.alloc() : memref<1x1000xf32> scf.for %arg1 = %c0 to %c1000 step %c1 { %0 = memref.load %arg0[%c0, %arg1] : memref<1x1000xf16> %1 = arith.extf %0 : f16 to f32 - memref.store %1, %alloc[%c0, %arg1] : memref<1x1000xf32> + %2 = arith.truncf %1 : f32 to f16 + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%c0, %arg1] : memref<1x1000xf32> } - return %alloc : memref<1x1000xf32> + %collapse_shape = memref.collapse_shape %alloc [[0, 1]] : memref<1x1000xf32> into memref<1000xf32> + return %collapse_shape : memref<1000xf32> } - func.func private @Unknown79(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c1000 = arith.constant 1000 : index + func.func private @Unknown79(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<1000xf32> - scf.for %arg1 = %c0 to %c1000 step %c1 { - %0 = memref.load %arg0[%arg1] : memref<1000xf32> - %1 = arith.truncf %0 : f32 to f16 - %2 = arith.extf %1 : f16 to f32 - memref.store %2, %alloc[%arg1] : memref<1000xf32> - } - return %alloc : memref<1000xf32> - } - func.func private @Unknown80(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c512000 = arith.constant 512000 : index - %c1 = arith.constant 1 : index - %c512 = 
arith.constant 512 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1000x512xf32> scf.for %arg1 = %c0 to %c512000 step %c1 { %0 = arith.remsi %arg1, %c512 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c512 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c512 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3] : memref<1000x512xf16> - %11 = arith.extf %10 : f16 to f32 - memref.store %11, %alloc[%9, %3] : memref<1000x512xf32> + %1 = arith.divsi %arg1, %c512 : index + %2 = memref.load %arg0[%1, %0] : memref<1000x512xf16> + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%1, %0] : memref<1000x512xf32> } return %alloc : memref<1000x512xf32> } - func.func private @Unknown81(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index + func.func private @Unknown80(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf32> - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown82(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select 
%4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf32> - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown83(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<64x64x3x3xf32> scf.for %arg1 = %c0 to %c36864 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<64x64x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<64x64x3x3xf32> } return %alloc : memref<64x64x3x3xf32> } - func.func private @Unknown84(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index + func.func 
private @Unknown84(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf32> - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown85(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c73728 = arith.constant 73728 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x3x3xf32> scf.for %arg1 = %c0 to %c73728 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x64x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + 
%2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<128x64x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<128x64x3x3xf32> } return %alloc : memref<128x64x3x3xf32> } - func.func private @Unknown86(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c1 = arith.constant 1 : index + func.func private @Unknown85(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c1 = arith.constant 1 : index %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c147456 = arith.constant 147456 : index %alloc = memref.alloc() : memref<128x128x3x3xf32> scf.for %arg1 = %c0 to %c147456 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<128x128x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<128x128x3x3xf32> } return %alloc : memref<128x128x3x3xf32> } - func.func private @Unknown87(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown86(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<128x64x1x1xf32> scf.for %arg1 = %c0 to %c8192 step %c1 { %0 = arith.remsi %arg1, %c64 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c64 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, 
%c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c64 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<128x64x1x1xf16> - %11 = arith.extf %10 : f16 to f32 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<128x64x1x1xf32> + %1 = arith.divsi %arg1, %c64 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<128x64x1x1xf16> + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<128x64x1x1xf32> } return %alloc : memref<128x64x1x1xf32> } - func.func private @Unknown88(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c1 = arith.constant 1 : index + func.func private @Unknown89(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<128x128x3x3xf32> - scf.for %arg1 = %c0 to %c147456 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf32> - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown89(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<128x128x3x3xf32> - scf.for %arg1 = %c0 to %c147456 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = 
arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf32> - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown90(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c294912 = arith.constant 294912 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x3x3xf32> scf.for %arg1 = %c0 to %c294912 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x128x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x128x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<256x128x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<256x128x3x3xf32> } return %alloc : memref<256x128x3x3xf32> } - func.func private @Unknown91(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c1 = arith.constant 1 : index + func.func private @Unknown90(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c1 = arith.constant 1 : 
index %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index + %c589824 = arith.constant 589824 : index %alloc = memref.alloc() : memref<256x256x3x3xf32> scf.for %arg1 = %c0 to %c589824 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<256x256x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<256x256x3x3xf32> } return %alloc : memref<256x256x3x3xf32> } - func.func private @Unknown92(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown91(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<256x128x1x1xf32> scf.for %arg1 = %c0 to %c32768 step %c1 { %0 = arith.remsi %arg1, %c128 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c128 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c128 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<256x128x1x1xf16> - %11 = arith.extf %10 : f16 to f32 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<256x128x1x1xf32> + %1 = arith.divsi %arg1, %c128 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<256x128x1x1xf16> + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<256x128x1x1xf32> } return %alloc : memref<256x128x1x1xf32> } - func.func private @Unknown93(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - 
%c589824 = arith.constant 589824 : index - %c1 = arith.constant 1 : index + func.func private @Unknown94(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x256x3x3xf32> - scf.for %arg1 = %c0 to %c589824 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf32> - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown94(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x256x3x3xf32> - scf.for %arg1 = %c0 to %c589824 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, 
%alloc[%29, %23, %13, %3] : memref<256x256x3x3xf32> - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown95(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c1179648 = arith.constant 1179648 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x3x3xf32> scf.for %arg1 = %c0 to %c1179648 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x256x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x256x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<512x256x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<512x256x3x3xf32> } return %alloc : memref<512x256x3x3xf32> } - func.func private @Unknown96(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index + func.func private @Unknown95(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c1 = arith.constant 1 : index %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index + %c2359296 = arith.constant 2359296 : index %alloc = memref.alloc() : memref<512x512x3x3xf32> scf.for %arg1 = %c0 to %c2359296 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - 
%15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<512x512x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<512x512x3x3xf32> } return %alloc : memref<512x512x3x3xf32> } - func.func private @Unknown97(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown96(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c1 = arith.constant 1 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<512x256x1x1xf32> scf.for %arg1 = %c0 to %c131072 step %c1 { %0 = arith.remsi %arg1, %c256 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c256 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c256 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<512x256x1x1xf16> - %11 = arith.extf %10 : f16 to f32 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<512x256x1x1xf32> + %1 = arith.divsi %arg1, %c256 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<512x256x1x1xf16> + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<512x256x1x1xf32> } return %alloc : memref<512x256x1x1xf32> } - func.func private @Unknown98(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<512x512x3x3xf32> - scf.for %arg1 = %c0 to %c2359296 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = 
arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf32> - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown99(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<512x512x3x3xf32> - scf.for %arg1 = %c0 to %c2359296 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf32> - } - return %alloc : memref<512x512x3x3xf32> - } func.func @main(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>, %arg3: memref<64xf32>, %arg4: memref<64xf32>, %arg5: memref<64xf32>, %arg6: memref<64xf32>, %arg7: memref<64xf32>, %arg8: memref<64xf32>, %arg9: memref<64xf32>, %arg10: memref<128xf32>, %arg11: memref<128xf32>, %arg12: memref<128xf32>, %arg13: memref<128xf32>, %arg14: memref<128xf32>, %arg15: memref<128xf32>, %arg16: memref<128xf32>, %arg17: memref<128xf32>, %arg18: memref<128xf32>, %arg19: memref<128xf32>, %arg20: memref<256xf32>, %arg21: memref<256xf32>, %arg22: memref<256xf32>, %arg23: memref<256xf32>, %arg24: memref<256xf32>, %arg25: memref<256xf32>, %arg26: memref<256xf32>, %arg27: memref<256xf32>, %arg28: memref<256xf32>, %arg29: memref<256xf32>, %arg30: memref<512xf32>, %arg31: memref<512xf32>, %arg32: 
memref<512xf32>, %arg33: memref<512xf32>, %arg34: memref<512xf32>, %arg35: memref<512xf32>, %arg36: memref<512xf32>, %arg37: memref<512xf32>, %arg38: memref<512xf32>, %arg39: memref<512xf32>, %arg40: memref<64xf32>, %arg41: memref<64xf32>, %arg42: memref<64xf32>, %arg43: memref<64xf32>, %arg44: memref<64xf32>, %arg45: memref<64xf32>, %arg46: memref<64xf32>, %arg47: memref<64xf32>, %arg48: memref<64xf32>, %arg49: memref<64xf32>, %arg50: memref<128xf32>, %arg51: memref<128xf32>, %arg52: memref<128xf32>, %arg53: memref<128xf32>, %arg54: memref<128xf32>, %arg55: memref<128xf32>, %arg56: memref<128xf32>, %arg57: memref<128xf32>, %arg58: memref<128xf32>, %arg59: memref<128xf32>, %arg60: memref<256xf32>, %arg61: memref<256xf32>, %arg62: memref<256xf32>, %arg63: memref<256xf32>, %arg64: memref<256xf32>, %arg65: memref<256xf32>, %arg66: memref<256xf32>, %arg67: memref<256xf32>, %arg68: memref<256xf32>, %arg69: memref<256xf32>, %arg70: memref<512xf32>, %arg71: memref<512xf32>, %arg72: memref<512xf32>, %arg73: memref<512xf32>, %arg74: memref<512xf32>, %arg75: memref<512xf32>, %arg76: memref<512xf32>, %arg77: memref<512xf32>, %arg78: memref<512xf32>, %arg79: memref<512xf32>, %arg80: memref<64x3x7x7xf16>, %arg81: memref<1x3x224x224xf16>, %arg82: memref<1x64x112x112xf16>, %arg83: memref<1x64x112x112xf16>, %arg84: memref<1x64x56x56xf16>, %arg85: memref<64x64x3x3xf16>, %arg86: memref<1x64x56x56xf16>, %arg87: memref<1x64x56x56xf16>, %arg88: memref<64x64x3x3xf16>, %arg89: memref<1x64x56x56xf16>, %arg90: memref<1x64x56x56xf16>, %arg91: memref<64x64x3x3xf16>, %arg92: memref<1x64x56x56xf16>, %arg93: memref<1x64x56x56xf16>, %arg94: memref<64x64x3x3xf16>, %arg95: memref<1x64x56x56xf16>, %arg96: memref<1x64x56x56xf16>, %arg97: memref<128x64x3x3xf16>, %arg98: memref<1x128x28x28xf16>, %arg99: memref<1x128x28x28xf16>, %arg100: memref<128x128x3x3xf16>, %arg101: memref<1x128x28x28xf16>, %arg102: memref<128x64x1x1xf16>, %arg103: memref<1x128x28x28xf16>, %arg104: memref<1x128x28x28xf16>, %arg105: memref<128x128x3x3xf16>, %arg106: memref<1x128x28x28xf16>, %arg107: memref<1x128x28x28xf16>, %arg108: memref<128x128x3x3xf16>, %arg109: memref<1x128x28x28xf16>, %arg110: memref<1x128x28x28xf16>, %arg111: memref<256x128x3x3xf16>, %arg112: memref<1x256x14x14xf16>, %arg113: memref<1x256x14x14xf16>, %arg114: memref<256x256x3x3xf16>, %arg115: memref<1x256x14x14xf16>, %arg116: memref<256x128x1x1xf16>, %arg117: memref<1x256x14x14xf16>, %arg118: memref<1x256x14x14xf16>, %arg119: memref<256x256x3x3xf16>, %arg120: memref<1x256x14x14xf16>, %arg121: memref<1x256x14x14xf16>, %arg122: memref<256x256x3x3xf16>, %arg123: memref<1x256x14x14xf16>, %arg124: memref<1x256x14x14xf16>, %arg125: memref<512x256x3x3xf16>, %arg126: memref<1x512x7x7xf16>, %arg127: memref<1x512x7x7xf16>, %arg128: memref<512x512x3x3xf16>, %arg129: memref<1x512x7x7xf16>, %arg130: memref<512x256x1x1xf16>, %arg131: memref<1x512x7x7xf16>, %arg132: memref<1x512x7x7xf16>, %arg133: memref<512x512x3x3xf16>, %arg134: memref<1x512x7x7xf16>, %arg135: memref<1x512x7x7xf16>, %arg136: memref<512x512x3x3xf16>, %arg137: memref<1x512x7x7xf16>, %arg138: memref<1x512x7x7xf16>, %arg139: memref<1x512xf16>, %arg140: memref<512x1000xf16>, %arg141: memref<1x1000xf16>) -> (memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, 
memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32>) attributes {__placeholder__byre.entry_point} { %alloc = memref.alloc() : memref<1x512xf16> byre.compute @MatmulOp_f16f16_f16(%arg141, %arg140, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x1000xf16>, memref<512x1000xf16>, memref<1x512xf16> @@ -1599,7 +496,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_10, %arg128, %alloc_13) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_14 = memref.alloc() : memref<512x512x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg127, %alloc_10, %alloc_14) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16> - %3 = call @Unknown12(%arg127, %alloc_13) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %3 = call @Unknown4(%arg127, %alloc_13) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> %alloc_15 = memref.alloc() : memref<1x512x7x7xf16> %alloc_16 = memref.alloc() : memref<512xf32> %alloc_17 = memref.alloc() : memref<512xf32> @@ -1634,7 +531,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_30, %arg119, %alloc_33) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_34 = memref.alloc() : memref<256x256x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %alloc_30, %alloc_34) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16> - %6 = call @Unknown27(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %6 = call @Unknown19(%4, 
%alloc_33, %arg118) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %alloc_35 = memref.alloc() : memref<1x256x14x14xf16> %alloc_36 = memref.alloc() : memref<256xf32> %alloc_37 = memref.alloc() : memref<256xf32> @@ -1643,7 +540,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_35, %arg114, %alloc_38) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_39 = memref.alloc() : memref<256x256x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %alloc_35, %alloc_39) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16> - %7 = call @Unknown31(%arg113, %alloc_38) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %7 = call @Unknown23(%arg113, %alloc_38) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16> %alloc_41 = memref.alloc() : memref<256xf32> %alloc_42 = memref.alloc() : memref<256xf32> @@ -1678,7 +575,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_55, %arg105, %alloc_58) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_59 = memref.alloc() : memref<128x128x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg104, %alloc_55, %alloc_59) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16> - %10 = call @Unknown46(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %10 = call @Unknown38(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %alloc_60 = memref.alloc() : memref<1x128x28x28xf16> %alloc_61 = memref.alloc() : memref<128xf32> %alloc_62 = memref.alloc() : memref<128xf32> @@ -1687,7 +584,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_60, %arg100, %alloc_63) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_64 = memref.alloc() : memref<128x128x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg99, %alloc_60, %alloc_64) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = 
"NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16> - %11 = call @Unknown50(%arg99, %alloc_63) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %11 = call @Unknown42(%arg99, %alloc_63) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %alloc_65 = memref.alloc() : memref<1x128x28x28xf16> %alloc_66 = memref.alloc() : memref<128xf32> %alloc_67 = memref.alloc() : memref<128xf32> @@ -1722,7 +619,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_80, %arg91, %alloc_83) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_84 = memref.alloc() : memref<64x64x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg90, %alloc_80, %alloc_84) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16> - %14 = call @Unknown65(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %14 = call @Unknown57(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %alloc_85 = memref.alloc() : memref<1x64x56x56xf16> %alloc_86 = memref.alloc() : memref<64xf32> %alloc_87 = memref.alloc() : memref<64xf32> @@ -1731,7 +628,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_85, %arg88, %alloc_88) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_89 = memref.alloc() : memref<64x64x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg87, %alloc_85, %alloc_89) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16> - %15 = call @Unknown69(%arg87, %alloc_88) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %15 = call @Unknown61(%arg87, %alloc_88) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %alloc_90 = memref.alloc() : memref<1x64x56x56xf16> %alloc_91 = memref.alloc() : memref<64xf32> %alloc_92 = memref.alloc() : memref<64xf32> @@ -1751,34 +648,31 @@ module { %alloc_99 = memref.alloc() : memref<64x3x7x7xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg81, %alloc_96, %alloc_99) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<64x3x7x7xf16> %18 = call @Unknown77(%alloc_99) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> - %19 = call @Unknown78(%arg141) : (memref<1x1000xf16>) -> memref<1x1000xf32> - %alloc_100 = memref.alloc() : memref<1000xf32> - byre.compute @ReduceSumOp_f32_f32(%19, %alloc_100) {dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf32>, memref<1000xf32> - %20 = call @Unknown79(%alloc_100) : (memref<1000xf32>) -> memref<1000xf32> + %19 = call @Unknown78(%arg141) : (memref<1x1000xf16>) -> memref<1000xf32> %collapse_shape = memref.collapse_shape %arg141 [[0, 1]] : memref<1x1000xf16> into memref<1000xf16> %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<1000xf16> into memref<1000x1xf16> - %alloc_101 = memref.alloc() : memref<1000x512xf16> - byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_101) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16>, memref<1x512xf16>, memref<1000x512xf16> - %21 = call @Unknown80(%alloc_101) : (memref<1000x512xf16>) -> memref<1000x512xf32> - %22 = call @Unknown81(%alloc_94) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %23 = call @Unknown82(%alloc_89) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %24 = call @Unknown83(%alloc_84) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %25 = call @Unknown84(%alloc_79) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %26 = call @Unknown85(%alloc_69) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> - %27 = call @Unknown86(%alloc_64) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %28 = call @Unknown87(%alloc_74) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> - %29 = call @Unknown88(%alloc_59) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %30 = call @Unknown89(%alloc_54) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %31 = call @Unknown90(%alloc_44) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> - %32 = call @Unknown91(%alloc_39) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %33 = call @Unknown92(%alloc_49) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> - %34 = call @Unknown93(%alloc_34) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %35 = call @Unknown94(%alloc_29) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %36 = call @Unknown95(%alloc_19) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> - %37 = call @Unknown96(%alloc_14) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %38 = call @Unknown97(%alloc_24) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> - %39 = call @Unknown98(%alloc_9) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %40 = call @Unknown99(%alloc_4) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - return %alloc_98, %alloc_97, %18, %20, %21, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %22, %23, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %24, %25, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %26, %27, %28, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %29, %30, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %31, %32, %33, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %34, %35, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %36, %37, %38, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %39, %40 : 
memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32> + %alloc_100 = memref.alloc() : memref<1000x512xf16> + byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_100) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16>, memref<1x512xf16>, memref<1000x512xf16> + %20 = call @Unknown79(%alloc_100) : (memref<1000x512xf16>) -> memref<1000x512xf32> + %21 = call @Unknown80(%alloc_94) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %22 = call @Unknown80(%alloc_89) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %23 = call @Unknown80(%alloc_84) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %24 = call @Unknown80(%alloc_79) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %25 = call @Unknown84(%alloc_69) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> + %26 = call @Unknown85(%alloc_64) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %27 = call @Unknown86(%alloc_74) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> + %28 = call @Unknown85(%alloc_59) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %29 = call @Unknown85(%alloc_54) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %30 = call @Unknown89(%alloc_44) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> + %31 = call @Unknown90(%alloc_39) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %32 = call @Unknown91(%alloc_49) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> + %33 = call @Unknown90(%alloc_34) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %34 = call @Unknown90(%alloc_29) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %35 = call @Unknown94(%alloc_19) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> + %36 = call @Unknown95(%alloc_14) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %37 = call @Unknown96(%alloc_24) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> + %38 = call @Unknown95(%alloc_9) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %39 = call @Unknown95(%alloc_4) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + return %alloc_98, %alloc_97, %18, %19, %20, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %21, %22, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %23, %24, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %25, %26, %27, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, 
%alloc_51, %28, %29, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %30, %31, %32, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %33, %34, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %35, %36, %37, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %38, %39 : memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/7_set_space_opt.mlir b/compiler/test/E2E/ResNet18/BW/7_set_space_opt.mlir index 6d46f31c1..043350d4b 100644 --- a/compiler/test/E2E/ResNet18/BW/7_set_space_opt.mlir +++ b/compiler/test/E2E/ResNet18/BW/7_set_space_opt.mlir @@ -1,918 +1,253 @@ -// RUN: byteir-opt %s -remove-func-body="anchor-attr=__byteir_elementwise_fusion__" -set-op-space="entry-func=main space=cuda" -set-arg-space="entry-func=main all-space=cuda" | FileCheck %s +// RUN: byteir-opt %s -remove-func-body="anchor-attr=__byteir_elementwise_fusion__" -inline -gpu-launch-func-to-byre -set-op-space="entry-func=main space=cuda" -set-arg-space="entry-func=main all-space=cuda" | FileCheck %s // CHECK-LABEL: func.func @main module attributes {gpu.container_module} { gpu.module @unified { - gpu.func @Unknown99(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : 
index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown98(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown97(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown96(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { %c131072 = arith.constant 131072 : index + %c0 = arith.constant 0 : index %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : 
memref<512x256x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> } gpu.return } - gpu.func @Unknown96(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown95(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf32> } gpu.return } - gpu.func @Unknown95(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown94(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : 
index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown94(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf32> } 
gpu.return } - gpu.func @Unknown93(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown92(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown91(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { %c32768 = arith.constant 32768 : index + %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> } gpu.return } - gpu.func @Unknown91(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown90(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) 
kernel { %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown90(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf32> + %5 = gpu.grid_dim 
x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf32> } gpu.return } - gpu.func @Unknown89(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown89(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { + %c294912 = arith.constant 294912 : index %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> - } - gpu.return - } - gpu.func @Unknown88(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 
= arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf32> } gpu.return } - gpu.func @Unknown87(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown86(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> } gpu.return } - gpu.func @Unknown86(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown85(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, 
%c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> - } - gpu.return - } - gpu.func @Unknown85(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf32> } gpu.return } - gpu.func @Unknown84(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 
36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown84(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { + %c73728 = arith.constant 73728 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown83(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store 
%37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf32> } gpu.return } - gpu.func @Unknown82(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown80(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown81(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = 
arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf32> } gpu.return } - gpu.func @Unknown80(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown79(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf32> - } - gpu.return - } - gpu.func @Unknown79(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { - %c1000 = arith.constant 1000 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<1000xf32> - %7 = arith.truncf %6 : f32 to f16 - %8 = arith.extf %7 : f16 to f32 - memref.store %8, %arg1[%4] : memref<1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf32> } gpu.return } @@ -924,1144 +259,517 @@ module attributes {gpu.container_module} { %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%c0, %4] : memref<1x1000xf16> - %7 = arith.extf %6 : f16 to f32 - memref.store %7, %arg1[%c0, %4] : memref<1x1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step 
%6 { + %7 = memref.load %arg0[%c0, %arg2] : memref<1x1000xf16> + %8 = arith.extf %7 : f16 to f32 + %9 = arith.truncf %8 : f32 to f16 + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%c0, %arg2] : memref<1x1000xf32> } gpu.return } gpu.func @Unknown77(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { - %c0 = arith.constant 0 : index %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf32> } gpu.return } gpu.func @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>, %arg2: memref<1x64x112x112xf16>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 
: index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x112x112xf16> } gpu.return } gpu.func @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - memref.store %28, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown69(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, 
%14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown65(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>, %arg3: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.addf %11, %12 : f16 + memref.store %13, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - 
%8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>, %arg3: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown50(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : 
index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown46(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>, %arg3: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %14 = arith.addf 
%11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>, %arg3: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 
= arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown31(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown27(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>, %arg3: memref<1x256x14x14xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 
= memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>, %arg3: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x 
- %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x256x14x14xf16> 
} gpu.return } gpu.func @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>, %arg3: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : 
index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index %c25088 = arith.constant 25088 : index + %cst = arith.constant 4.900000e+01 : f16 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg0[%c0, %25] : memref<1x512xf16> - %28 = arith.divf %27, %cst_0 : f16 - %29 = arith.cmpf ogt, %26, %cst : f16 - %30 = arith.select %29, %28, %cst : f16 - memref.store %30, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10] : memref<1x512xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.divf %11, %cst : f16 + %14 = arith.cmpf ogt, %12, %cst_0 : f16 + %15 = arith.select %14, %13, %cst_0 : f16 + memref.store %15, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } } - func.func private @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [2 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", 
__byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [2 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c25 = arith.constant 25 : index %c1 = arith.constant 1 : index - %c196 = arith.constant 196 : index - %alloc = memref.alloc() : memref<1x512x7x7xf16> - gpu.launch_func @unified::@Unknown0 blocks in (%c196, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512xf16>, %arg1 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c196 = arith.constant 196 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - gpu.launch_func @unified::@Unknown4 blocks in (%c196, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %arg1 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) + gpu.launch_func @unified::@Unknown0 blocks in (%c25, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x512xf16>, %arg1 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) return %alloc : memref<1x512x7x7xf16> } - func.func private @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown8", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c25 = arith.constant 25 : index %c1 = arith.constant 1 : index - %c196 = arith.constant 196 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - gpu.launch_func @unified::@Unknown8 blocks in (%c196, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %arg1 : memref<1x512x7x7xf16>, %arg2 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) + gpu.launch_func @unified::@Unknown4 blocks in (%c25, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %arg1 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) return %alloc : memref<1x512x7x7xf16> } - func.func private @Unknown12(%arg0: 
memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown12", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown8", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c25 = arith.constant 25 : index %c1 = arith.constant 1 : index - %c196 = arith.constant 196 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - gpu.launch_func @unified::@Unknown12 blocks in (%c196, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %arg1 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) + gpu.launch_func @unified::@Unknown8 blocks in (%c25, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %arg1 : memref<1x512x7x7xf16>, %arg2 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) return %alloc : memref<1x512x7x7xf16> } - func.func private @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown19", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 49 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown19", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c49 = arith.constant 49 : index %c1 = arith.constant 1 : index - %c392 = arith.constant 392 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - gpu.launch_func @unified::@Unknown19 blocks in (%c392, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %arg2 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c392 = arith.constant 392 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - gpu.launch_func @unified::@Unknown23 blocks in (%c392, %c1, %c1) threads in 
(%c128, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown27(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown27", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c392 = arith.constant 392 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - gpu.launch_func @unified::@Unknown27 blocks in (%c392, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %arg2 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) + gpu.launch_func @unified::@Unknown19 blocks in (%c49, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %arg2 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) return %alloc : memref<1x256x14x14xf16> } - func.func private @Unknown31(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown31", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 49 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c49 = arith.constant 49 : index %c1 = arith.constant 1 : index - %c392 = arith.constant 392 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - gpu.launch_func @unified::@Unknown31 blocks in (%c392, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) + gpu.launch_func @unified::@Unknown23 blocks in (%c49, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) return %alloc : memref<1x256x14x14xf16> } - func.func private @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown38", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - gpu.launch_func @unified::@Unknown38 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) 
args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %arg2 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown42", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown38", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - gpu.launch_func @unified::@Unknown42 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) + gpu.launch_func @unified::@Unknown38 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %arg2 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) return %alloc : memref<1x128x28x28xf16> } - func.func private @Unknown46(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown42", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - gpu.launch_func @unified::@Unknown46 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %arg2 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown50(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown50", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : 
index - %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - gpu.launch_func @unified::@Unknown50 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) + gpu.launch_func @unified::@Unknown42 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) return %alloc : memref<1x128x28x28xf16> } - func.func private @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown57 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %arg2 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown61 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown65(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown65", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown65 blocks in (%c1568, %c1, %c1) threads in (%c128, 
%c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %arg2 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) + gpu.launch_func @unified::@Unknown57 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %arg2 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) return %alloc : memref<1x64x56x56xf16> } - func.func private @Unknown69(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown69", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown69 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) + gpu.launch_func @unified::@Unknown61 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) return %alloc : memref<1x64x56x56xf16> } - func.func private @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown73", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown73", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown73 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) + gpu.launch_func @unified::@Unknown73 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) return %alloc : memref<1x64x56x56xf16> } - func.func private @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byre__BlockSize.x = 
128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown74", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown74", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c784 = arith.constant 784 : index %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x64x112x112xf16> - gpu.launch_func @unified::@Unknown74 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x112x112xf16>, %arg1 : memref<1x64x112x112xf16>, %alloc : memref<1x64x112x112xf16>) + gpu.launch_func @unified::@Unknown74 blocks in (%c784, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x64x112x112xf16>, %arg1 : memref<1x64x112x112xf16>, %alloc : memref<1x64x112x112xf16>) return %alloc : memref<1x64x112x112xf16> } - func.func private @Unknown77(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 74 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown77", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown77(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 10 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown77", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c10 = arith.constant 10 : index %c1 = arith.constant 1 : index - %c74 = arith.constant 74 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<64x3x7x7xf32> - gpu.launch_func @unified::@Unknown77 blocks in (%c74, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x3x7x7xf16>, %alloc : memref<64x3x7x7xf32>) + gpu.launch_func @unified::@Unknown77 blocks in (%c10, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<64x3x7x7xf16>, %alloc : memref<64x3x7x7xf32>) return %alloc : memref<64x3x7x7xf32> } - func.func private @Unknown78(%arg0: memref<1x1000xf16>) -> memref<1x1000xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown78", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown78(%arg0: memref<1x1000xf16>) -> memref<1000xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown78", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x1000xf32> - gpu.launch_func 
@unified::@Unknown78 blocks in (%c8, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x1000xf16>, %alloc : memref<1x1000xf32>) - return %alloc : memref<1x1000xf32> + %collapse_shape = memref.collapse_shape %alloc [[0, 1]] : memref<1x1000xf32> into memref<1000xf32> + gpu.launch_func @unified::@Unknown78 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x1000xf16>, %alloc : memref<1x1000xf32>) + return %collapse_shape : memref<1000xf32> } - func.func private @Unknown79(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown79", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown79(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 500 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown79", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c500 = arith.constant 500 : index %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index - %alloc = memref.alloc() : memref<1000xf32> - gpu.launch_func @unified::@Unknown79 blocks in (%c8, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1000xf32>, %alloc : memref<1000xf32>) - return %alloc : memref<1000xf32> - } - func.func private @Unknown80(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4000 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown80", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4000 = arith.constant 4000 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1000x512xf32> - gpu.launch_func @unified::@Unknown80 blocks in (%c4000, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1000x512xf16>, %alloc : memref<1000x512xf32>) + gpu.launch_func @unified::@Unknown79 blocks in (%c500, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1000x512xf16>, %alloc : memref<1000x512xf32>) return %alloc : memref<1000x512xf32> } - func.func private @Unknown81(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown81", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown80(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 36 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown80", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c36 = arith.constant 36 : index %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - gpu.launch_func @unified::@Unknown81 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) - return 
%alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown82(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown82", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - gpu.launch_func @unified::@Unknown82 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown83(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown83", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - gpu.launch_func @unified::@Unknown83 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown84(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown84", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<64x64x3x3xf32> - gpu.launch_func @unified::@Unknown84 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) + gpu.launch_func @unified::@Unknown80 blocks in (%c36, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) return %alloc : memref<64x64x3x3xf32> } - func.func private @Unknown85(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown85", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown84(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 72 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown84", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c72 = arith.constant 72 : index %c1 = arith.constant 1 : index - %c576 = arith.constant 576 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<128x64x3x3xf32> - gpu.launch_func @unified::@Unknown85 blocks in (%c576, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x64x3x3xf16>, %alloc : memref<128x64x3x3xf32>) + gpu.launch_func @unified::@Unknown84 
blocks in (%c72, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x64x3x3xf16>, %alloc : memref<128x64x3x3xf32>) return %alloc : memref<128x64x3x3xf32> } - func.func private @Unknown86(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown86", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown85(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 144 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown85", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c144 = arith.constant 144 : index %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<128x128x3x3xf32> - gpu.launch_func @unified::@Unknown86 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf16>, %alloc : memref<128x128x3x3xf32>) + gpu.launch_func @unified::@Unknown85 blocks in (%c144, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x128x3x3xf16>, %alloc : memref<128x128x3x3xf32>) return %alloc : memref<128x128x3x3xf32> } - func.func private @Unknown87(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 64 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown87", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown86(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown86", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c8 = arith.constant 8 : index %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<128x64x1x1xf32> - gpu.launch_func @unified::@Unknown87 blocks in (%c64, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x64x1x1xf16>, %alloc : memref<128x64x1x1xf32>) + gpu.launch_func @unified::@Unknown86 blocks in (%c8, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x64x1x1xf16>, %alloc : memref<128x64x1x1xf32>) return %alloc : memref<128x64x1x1xf32> } - func.func private @Unknown88(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown88", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf32> - gpu.launch_func @unified::@Unknown88 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf16>, %alloc : memref<128x128x3x3xf32>) - return %alloc : memref<128x128x3x3xf32> - } 
- func.func private @Unknown89(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown89", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf32> - gpu.launch_func @unified::@Unknown89 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf16>, %alloc : memref<128x128x3x3xf32>) - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown90(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown90", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown89(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown89", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c288 = arith.constant 288 : index %c1 = arith.constant 1 : index - %c2304 = arith.constant 2304 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x128x3x3xf32> - gpu.launch_func @unified::@Unknown90 blocks in (%c2304, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x128x3x3xf16>, %alloc : memref<256x128x3x3xf32>) + gpu.launch_func @unified::@Unknown89 blocks in (%c288, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x128x3x3xf16>, %alloc : memref<256x128x3x3xf32>) return %alloc : memref<256x128x3x3xf32> } - func.func private @Unknown91(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown91", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown90(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown90", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c576 = arith.constant 576 : index %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x256x3x3xf32> - gpu.launch_func @unified::@Unknown91 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf16>, %alloc : memref<256x256x3x3xf32>) + gpu.launch_func @unified::@Unknown90 blocks in (%c576, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x256x3x3xf16>, %alloc : memref<256x256x3x3xf32>) return %alloc : memref<256x256x3x3xf32> } - func.func private @Unknown92(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byre__BlockSize.x = 128 : i32, 
__byre__GridSize.x = 256 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown92", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown91(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown91", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c32 = arith.constant 32 : index %c1 = arith.constant 1 : index %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x128x1x1xf32> - gpu.launch_func @unified::@Unknown92 blocks in (%c256, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x128x1x1xf16>, %alloc : memref<256x128x1x1xf32>) + gpu.launch_func @unified::@Unknown91 blocks in (%c32, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x128x1x1xf16>, %alloc : memref<256x128x1x1xf32>) return %alloc : memref<256x128x1x1xf32> } - func.func private @Unknown93(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown93", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf32> - gpu.launch_func @unified::@Unknown93 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf16>, %alloc : memref<256x256x3x3xf32>) - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown94(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown94", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf32> - gpu.launch_func @unified::@Unknown94 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf16>, %alloc : memref<256x256x3x3xf32>) - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown95(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 9216 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown95", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown94(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown94", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c1152 = arith.constant 1152 : index %c1 = arith.constant 1 : index - %c9216 = arith.constant 9216 : index + %c256 = arith.constant 256 : index %alloc = 
memref.alloc() : memref<512x256x3x3xf32> - gpu.launch_func @unified::@Unknown95 blocks in (%c9216, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x256x3x3xf16>, %alloc : memref<512x256x3x3xf32>) + gpu.launch_func @unified::@Unknown94 blocks in (%c1152, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x256x3x3xf16>, %alloc : memref<512x256x3x3xf32>) return %alloc : memref<512x256x3x3xf32> } - func.func private @Unknown96(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown96", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown95(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown95", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c2304 = arith.constant 2304 : index %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x512x3x3xf32> - gpu.launch_func @unified::@Unknown96 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf16>, %alloc : memref<512x512x3x3xf32>) + gpu.launch_func @unified::@Unknown95 blocks in (%c2304, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x512x3x3xf16>, %alloc : memref<512x512x3x3xf32>) return %alloc : memref<512x512x3x3xf32> } - func.func private @Unknown97(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1024 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown97", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + func.func private @Unknown96(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 128 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown96", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c128 = arith.constant 128 : index %c1 = arith.constant 1 : index - %c1024 = arith.constant 1024 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf32> - gpu.launch_func @unified::@Unknown97 blocks in (%c1024, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x256x1x1xf16>, %alloc : memref<512x256x1x1xf32>) + gpu.launch_func @unified::@Unknown96 blocks in (%c128, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x256x1x1xf16>, %alloc : memref<512x256x1x1xf32>) return %alloc : memref<512x256x1x1xf32> } - func.func private @Unknown98(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown98", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = 
memref.alloc() : memref<512x512x3x3xf32> - gpu.launch_func @unified::@Unknown98 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf16>, %alloc : memref<512x512x3x3xf32>) - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown99(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown99", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = memref.alloc() : memref<512x512x3x3xf32> - gpu.launch_func @unified::@Unknown99 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf16>, %alloc : memref<512x512x3x3xf32>) - return %alloc : memref<512x512x3x3xf32> - } func.func @main(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>, %arg3: memref<64xf32>, %arg4: memref<64xf32>, %arg5: memref<64xf32>, %arg6: memref<64xf32>, %arg7: memref<64xf32>, %arg8: memref<64xf32>, %arg9: memref<64xf32>, %arg10: memref<128xf32>, %arg11: memref<128xf32>, %arg12: memref<128xf32>, %arg13: memref<128xf32>, %arg14: memref<128xf32>, %arg15: memref<128xf32>, %arg16: memref<128xf32>, %arg17: memref<128xf32>, %arg18: memref<128xf32>, %arg19: memref<128xf32>, %arg20: memref<256xf32>, %arg21: memref<256xf32>, %arg22: memref<256xf32>, %arg23: memref<256xf32>, %arg24: memref<256xf32>, %arg25: memref<256xf32>, %arg26: memref<256xf32>, %arg27: memref<256xf32>, %arg28: memref<256xf32>, %arg29: memref<256xf32>, %arg30: memref<512xf32>, %arg31: memref<512xf32>, %arg32: memref<512xf32>, %arg33: memref<512xf32>, %arg34: memref<512xf32>, %arg35: memref<512xf32>, %arg36: memref<512xf32>, %arg37: memref<512xf32>, %arg38: memref<512xf32>, %arg39: memref<512xf32>, %arg40: memref<64xf32>, %arg41: memref<64xf32>, %arg42: memref<64xf32>, %arg43: memref<64xf32>, %arg44: memref<64xf32>, %arg45: memref<64xf32>, %arg46: memref<64xf32>, %arg47: memref<64xf32>, %arg48: memref<64xf32>, %arg49: memref<64xf32>, %arg50: memref<128xf32>, %arg51: memref<128xf32>, %arg52: memref<128xf32>, %arg53: memref<128xf32>, %arg54: memref<128xf32>, %arg55: memref<128xf32>, %arg56: memref<128xf32>, %arg57: memref<128xf32>, %arg58: memref<128xf32>, %arg59: memref<128xf32>, %arg60: memref<256xf32>, %arg61: memref<256xf32>, %arg62: memref<256xf32>, %arg63: memref<256xf32>, %arg64: memref<256xf32>, %arg65: memref<256xf32>, %arg66: memref<256xf32>, %arg67: memref<256xf32>, %arg68: memref<256xf32>, %arg69: memref<256xf32>, %arg70: memref<512xf32>, %arg71: memref<512xf32>, %arg72: memref<512xf32>, %arg73: memref<512xf32>, %arg74: memref<512xf32>, %arg75: memref<512xf32>, %arg76: memref<512xf32>, %arg77: memref<512xf32>, %arg78: memref<512xf32>, %arg79: memref<512xf32>, %arg80: memref<64x3x7x7xf16>, %arg81: memref<1x3x224x224xf16>, %arg82: memref<1x64x112x112xf16>, %arg83: memref<1x64x112x112xf16>, %arg84: memref<1x64x56x56xf16>, %arg85: memref<64x64x3x3xf16>, %arg86: memref<1x64x56x56xf16>, %arg87: memref<1x64x56x56xf16>, %arg88: memref<64x64x3x3xf16>, %arg89: memref<1x64x56x56xf16>, %arg90: memref<1x64x56x56xf16>, %arg91: memref<64x64x3x3xf16>, %arg92: memref<1x64x56x56xf16>, %arg93: memref<1x64x56x56xf16>, %arg94: memref<64x64x3x3xf16>, %arg95: memref<1x64x56x56xf16>, %arg96: memref<1x64x56x56xf16>, %arg97: 
memref<128x64x3x3xf16>, %arg98: memref<1x128x28x28xf16>, %arg99: memref<1x128x28x28xf16>, %arg100: memref<128x128x3x3xf16>, %arg101: memref<1x128x28x28xf16>, %arg102: memref<128x64x1x1xf16>, %arg103: memref<1x128x28x28xf16>, %arg104: memref<1x128x28x28xf16>, %arg105: memref<128x128x3x3xf16>, %arg106: memref<1x128x28x28xf16>, %arg107: memref<1x128x28x28xf16>, %arg108: memref<128x128x3x3xf16>, %arg109: memref<1x128x28x28xf16>, %arg110: memref<1x128x28x28xf16>, %arg111: memref<256x128x3x3xf16>, %arg112: memref<1x256x14x14xf16>, %arg113: memref<1x256x14x14xf16>, %arg114: memref<256x256x3x3xf16>, %arg115: memref<1x256x14x14xf16>, %arg116: memref<256x128x1x1xf16>, %arg117: memref<1x256x14x14xf16>, %arg118: memref<1x256x14x14xf16>, %arg119: memref<256x256x3x3xf16>, %arg120: memref<1x256x14x14xf16>, %arg121: memref<1x256x14x14xf16>, %arg122: memref<256x256x3x3xf16>, %arg123: memref<1x256x14x14xf16>, %arg124: memref<1x256x14x14xf16>, %arg125: memref<512x256x3x3xf16>, %arg126: memref<1x512x7x7xf16>, %arg127: memref<1x512x7x7xf16>, %arg128: memref<512x512x3x3xf16>, %arg129: memref<1x512x7x7xf16>, %arg130: memref<512x256x1x1xf16>, %arg131: memref<1x512x7x7xf16>, %arg132: memref<1x512x7x7xf16>, %arg133: memref<512x512x3x3xf16>, %arg134: memref<1x512x7x7xf16>, %arg135: memref<1x512x7x7xf16>, %arg136: memref<512x512x3x3xf16>, %arg137: memref<1x512x7x7xf16>, %arg138: memref<1x512x7x7xf16>, %arg139: memref<1x512xf16>, %arg140: memref<512x1000xf16>, %arg141: memref<1x1000xf16>) -> (memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32>) attributes {__placeholder__byre.entry_point} { %alloc = memref.alloc() : memref<1x512xf16> byre.compute @MatmulOp_f16f16_f16(%arg141, %arg140, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x1000xf16>, memref<512x1000xf16>, memref<1x512xf16> @@ -2092,7 +800,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_10, %arg128, %alloc_13) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_14 = memref.alloc() : memref<512x512x3x3xf16> byre.compute 
@ConvBackwardFilterOp_f16f16_f16(%arg127, %alloc_10, %alloc_14) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16> - %3 = call @Unknown12(%arg127, %alloc_13) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %3 = call @Unknown4(%arg127, %alloc_13) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> %alloc_15 = memref.alloc() : memref<1x512x7x7xf16> %alloc_16 = memref.alloc() : memref<512xf32> %alloc_17 = memref.alloc() : memref<512xf32> @@ -2127,7 +835,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_30, %arg119, %alloc_33) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_34 = memref.alloc() : memref<256x256x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %alloc_30, %alloc_34) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16> - %6 = call @Unknown27(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %6 = call @Unknown19(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %alloc_35 = memref.alloc() : memref<1x256x14x14xf16> %alloc_36 = memref.alloc() : memref<256xf32> %alloc_37 = memref.alloc() : memref<256xf32> @@ -2136,7 +844,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_35, %arg114, %alloc_38) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_39 = memref.alloc() : memref<256x256x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %alloc_35, %alloc_39) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16> - %7 = call @Unknown31(%arg113, %alloc_38) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %7 = call @Unknown23(%arg113, %alloc_38) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16> %alloc_41 = memref.alloc() : memref<256xf32> %alloc_42 = memref.alloc() : memref<256xf32> @@ -2171,7 +879,7 @@ module attributes {gpu.container_module} { byre.compute 
@ConvBackwardDataOp_f16f16_f16(%alloc_55, %arg105, %alloc_58) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_59 = memref.alloc() : memref<128x128x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg104, %alloc_55, %alloc_59) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16> - %10 = call @Unknown46(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %10 = call @Unknown38(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %alloc_60 = memref.alloc() : memref<1x128x28x28xf16> %alloc_61 = memref.alloc() : memref<128xf32> %alloc_62 = memref.alloc() : memref<128xf32> @@ -2180,7 +888,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_60, %arg100, %alloc_63) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_64 = memref.alloc() : memref<128x128x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg99, %alloc_60, %alloc_64) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16> - %11 = call @Unknown50(%arg99, %alloc_63) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %11 = call @Unknown42(%arg99, %alloc_63) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %alloc_65 = memref.alloc() : memref<1x128x28x28xf16> %alloc_66 = memref.alloc() : memref<128xf32> %alloc_67 = memref.alloc() : memref<128xf32> @@ -2215,7 +923,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_80, %arg91, %alloc_83) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_84 = memref.alloc() : memref<64x64x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg90, %alloc_80, %alloc_84) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16> - 
%14 = call @Unknown65(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %14 = call @Unknown57(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %alloc_85 = memref.alloc() : memref<1x64x56x56xf16> %alloc_86 = memref.alloc() : memref<64xf32> %alloc_87 = memref.alloc() : memref<64xf32> @@ -2224,7 +932,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_85, %arg88, %alloc_88) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_89 = memref.alloc() : memref<64x64x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg87, %alloc_85, %alloc_89) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16> - %15 = call @Unknown69(%arg87, %alloc_88) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %15 = call @Unknown61(%arg87, %alloc_88) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %alloc_90 = memref.alloc() : memref<1x64x56x56xf16> %alloc_91 = memref.alloc() : memref<64xf32> %alloc_92 = memref.alloc() : memref<64xf32> @@ -2244,34 +952,31 @@ module attributes {gpu.container_module} { %alloc_99 = memref.alloc() : memref<64x3x7x7xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg81, %alloc_96, %alloc_99) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<64x3x7x7xf16> %18 = call @Unknown77(%alloc_99) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> - %19 = call @Unknown78(%arg141) : (memref<1x1000xf16>) -> memref<1x1000xf32> - %alloc_100 = memref.alloc() : memref<1000xf32> - byre.compute @ReduceSumOp_f32_f32(%19, %alloc_100) {dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf32>, memref<1000xf32> - %20 = call @Unknown79(%alloc_100) : (memref<1000xf32>) -> memref<1000xf32> + %19 = call @Unknown78(%arg141) : (memref<1x1000xf16>) -> memref<1000xf32> %collapse_shape = memref.collapse_shape %arg141 [[0, 1]] : memref<1x1000xf16> into memref<1000xf16> %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<1000xf16> into memref<1000x1xf16> - %alloc_101 = memref.alloc() : memref<1000x512xf16> - byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_101) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16>, memref<1x512xf16>, memref<1000x512xf16> - %21 = call @Unknown80(%alloc_101) : (memref<1000x512xf16>) -> memref<1000x512xf32> - %22 = call @Unknown81(%alloc_94) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %23 = call @Unknown82(%alloc_89) : (memref<64x64x3x3xf16>) -> 
memref<64x64x3x3xf32> - %24 = call @Unknown83(%alloc_84) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %25 = call @Unknown84(%alloc_79) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %26 = call @Unknown85(%alloc_69) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> - %27 = call @Unknown86(%alloc_64) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %28 = call @Unknown87(%alloc_74) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> - %29 = call @Unknown88(%alloc_59) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %30 = call @Unknown89(%alloc_54) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %31 = call @Unknown90(%alloc_44) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> - %32 = call @Unknown91(%alloc_39) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %33 = call @Unknown92(%alloc_49) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> - %34 = call @Unknown93(%alloc_34) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %35 = call @Unknown94(%alloc_29) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %36 = call @Unknown95(%alloc_19) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> - %37 = call @Unknown96(%alloc_14) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %38 = call @Unknown97(%alloc_24) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> - %39 = call @Unknown98(%alloc_9) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %40 = call @Unknown99(%alloc_4) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - return %alloc_98, %alloc_97, %18, %20, %21, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %22, %23, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %24, %25, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %26, %27, %28, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %29, %30, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %31, %32, %33, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %34, %35, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %36, %37, %38, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %39, %40 : memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32> + %alloc_100 = memref.alloc() : memref<1000x512xf16> + byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_100) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : 
memref<1000x1xf16>, memref<1x512xf16>, memref<1000x512xf16> + %20 = call @Unknown79(%alloc_100) : (memref<1000x512xf16>) -> memref<1000x512xf32> + %21 = call @Unknown80(%alloc_94) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %22 = call @Unknown80(%alloc_89) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %23 = call @Unknown80(%alloc_84) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %24 = call @Unknown80(%alloc_79) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %25 = call @Unknown84(%alloc_69) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> + %26 = call @Unknown85(%alloc_64) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %27 = call @Unknown86(%alloc_74) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> + %28 = call @Unknown85(%alloc_59) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %29 = call @Unknown85(%alloc_54) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %30 = call @Unknown89(%alloc_44) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> + %31 = call @Unknown90(%alloc_39) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %32 = call @Unknown91(%alloc_49) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> + %33 = call @Unknown90(%alloc_34) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %34 = call @Unknown90(%alloc_29) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %35 = call @Unknown94(%alloc_19) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> + %36 = call @Unknown95(%alloc_14) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %37 = call @Unknown96(%alloc_24) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> + %38 = call @Unknown95(%alloc_9) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %39 = call @Unknown95(%alloc_4) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + return %alloc_98, %alloc_97, %18, %19, %20, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %21, %22, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %23, %24, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %25, %26, %27, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %28, %29, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %30, %31, %32, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %33, %34, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %35, %36, %37, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %38, %39 : memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, 
memref<512x512x3x3xf32>, memref<512x512x3x3xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/8_byre_opt.mlir b/compiler/test/E2E/ResNet18/BW/8_byre_opt.mlir index 67dcb06aa..e22e39fa7 100644 --- a/compiler/test/E2E/ResNet18/BW/8_byre_opt.mlir +++ b/compiler/test/E2E/ResNet18/BW/8_byre_opt.mlir @@ -4,915 +4,250 @@ module attributes {gpu.container_module} { gpu.module @unified { - gpu.func @Unknown99(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown98(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = 
arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown97(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown96(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { %c131072 = arith.constant 131072 : index + %c0 = arith.constant 0 : index %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> } gpu.return } - gpu.func @Unknown96(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown95(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi 
slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown95(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf32> } gpu.return } - gpu.func @Unknown94(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown94(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = 
arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown93(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf32> } gpu.return } - gpu.func @Unknown92(%arg0: 
memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown91(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { %c32768 = arith.constant 32768 : index + %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> } gpu.return } - gpu.func @Unknown91(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown90(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown90(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : 
index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf32> } gpu.return } - gpu.func @Unknown89(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown89(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { + %c294912 = arith.constant 294912 : index %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = 
arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> - } - gpu.return - } - gpu.func @Unknown88(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf32> } gpu.return } - gpu.func @Unknown87(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown86(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, 
%c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> } gpu.return } - gpu.func @Unknown86(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown85(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf32> } gpu.return } - gpu.func @Unknown85(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown84(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { %c73728 = arith.constant 73728 : index - %c3 = 
arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown84(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 
= arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf32> } gpu.return } - gpu.func @Unknown83(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown80(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown82(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = 
arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf32> } gpu.return } - gpu.func @Unknown81(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown80(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown79(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, 
%6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf32> - } - gpu.return - } - gpu.func @Unknown79(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { - %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<1000xf32> - %7 = arith.truncf %6 : f32 to f16 - %8 = arith.extf %7 : f16 to f32 - memref.store %8, %arg1[%4] : memref<1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf32> } gpu.return } @@ -924,857 +259,349 @@ module attributes {gpu.container_module} { %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%c0, %4] : memref<1x1000xf16> - %7 = arith.extf %6 : f16 to f32 - memref.store %7, %arg1[%c0, %4] : memref<1x1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%c0, %arg2] : memref<1x1000xf16> + %8 = arith.extf %7 : f16 to f32 + %9 = arith.truncf %8 : f32 to f16 + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%c0, %arg2] : memref<1x1000xf32> } gpu.return } gpu.func @Unknown77(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { - %c0 = arith.constant 0 : index %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - 
%36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf32> } gpu.return } gpu.func @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>, %arg2: memref<1x64x112x112xf16>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x112x112xf16> } gpu.return } gpu.func @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = 
arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - memref.store %28, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown69(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown65(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>, %arg3: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : 
index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.addf %11, %12 : f16 + memref.store %13, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>, %arg3: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant 
-1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown50(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown46(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>, %arg3: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : 
index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to 
%c100352 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>, %arg3: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown31(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : 
memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown27(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>, %arg3: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = 
arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>, %arg3: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = 
arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>, %arg3: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c25088 step %6 { + %7 = arith.remsi 
%arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index %c25088 = arith.constant 25088 : index + %cst = arith.constant 4.900000e+01 : f16 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, 
%11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg0[%c0, %25] : memref<1x512xf16> - %28 = arith.divf %27, %cst_0 : f16 - %29 = arith.cmpf ogt, %26, %cst : f16 - %30 = arith.select %29, %28, %cst : f16 - memref.store %30, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10] : memref<1x512xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.divf %11, %cst : f16 + %14 = arith.cmpf ogt, %12, %cst_0 : f16 + %15 = arith.select %14, %13, %cst_0 : f16 + memref.store %15, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } } - func.func private @Unknown0(memref<1x512xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [2 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown4(memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown8(memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown8", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown12(memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown12", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown19(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown19", __byteir_elementwise_fusion__, 
arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown23(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown27(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown27", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown31(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown31", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown38(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown38", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown42(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown42", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown46(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown50(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown50", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown57(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, 
__byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown61(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown65(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown65", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown69(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown69", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown73(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown73", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown74(memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">) -> memref<1x64x112x112xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown74", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown77(memref<64x3x7x7xf16, "cuda">) -> memref<64x3x7x7xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 74 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown77", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown78(memref<1x1000xf16, "cuda">) -> memref<1x1000xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown78", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown79(memref<1000xf32, "cuda">) -> memref<1000xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown79", __byteir_elementwise_fusion__, arg_offsets = [0 
: i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown80(memref<1000x512xf16, "cuda">) -> memref<1000x512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4000 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown80", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown81(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown81", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown82(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown82", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown83(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown83", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown84(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown84", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown85(memref<128x64x3x3xf16, "cuda">) -> memref<128x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown85", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown86(memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown86", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown87(memref<128x64x1x1xf16, "cuda">) -> memref<128x64x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 64 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown87", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown88(memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown88", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - 
func.func private @Unknown89(memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown89", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown90(memref<256x128x3x3xf16, "cuda">) -> memref<256x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown90", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown91(memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown91", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown92(memref<256x128x1x1xf16, "cuda">) -> memref<256x128x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 256 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown92", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown93(memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown93", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown94(memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown94", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown95(memref<512x256x3x3xf16, "cuda">) -> memref<512x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 9216 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown95", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown96(memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown96", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown97(memref<512x256x1x1xf16, "cuda">) -> memref<512x256x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1024 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown97", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown98(memref<512x512x3x3xf16, "cuda">) 
-> memref<512x512x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown98", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown99(memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown99", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown0(memref<1x512xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [2 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown4(memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown8(memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown8", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown19(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 49 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown19", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown23(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 49 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown38(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown38", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown42(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, 
"cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown42", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown57(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown61(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown73(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown73", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown74(memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">) -> memref<1x64x112x112xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown74", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown77(memref<64x3x7x7xf16, "cuda">) -> memref<64x3x7x7xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 10 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown77", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown78(memref<1x1000xf16, "cuda">) -> memref<1000xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown78", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown79(memref<1000x512xf16, "cuda">) -> memref<1000x512xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 500 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown79", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown80(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 36 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown80", 
__byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown84(memref<128x64x3x3xf16, "cuda">) -> memref<128x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 72 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown84", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown85(memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 144 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown85", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown86(memref<128x64x1x1xf16, "cuda">) -> memref<128x64x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown86", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown89(memref<256x128x3x3xf16, "cuda">) -> memref<256x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown89", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown90(memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown90", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown91(memref<256x128x1x1xf16, "cuda">) -> memref<256x128x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown91", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown94(memref<512x256x3x3xf16, "cuda">) -> memref<512x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown94", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown95(memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown95", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown96(memref<512x256x1x1xf16, "cuda">) -> memref<512x256x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 128 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown96", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], 
byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} func.func @main(%arg0: memref<64xf32, "cuda">, %arg1: memref<64xf32, "cuda">, %arg2: memref<64xf32, "cuda">, %arg3: memref<64xf32, "cuda">, %arg4: memref<64xf32, "cuda">, %arg5: memref<64xf32, "cuda">, %arg6: memref<64xf32, "cuda">, %arg7: memref<64xf32, "cuda">, %arg8: memref<64xf32, "cuda">, %arg9: memref<64xf32, "cuda">, %arg10: memref<128xf32, "cuda">, %arg11: memref<128xf32, "cuda">, %arg12: memref<128xf32, "cuda">, %arg13: memref<128xf32, "cuda">, %arg14: memref<128xf32, "cuda">, %arg15: memref<128xf32, "cuda">, %arg16: memref<128xf32, "cuda">, %arg17: memref<128xf32, "cuda">, %arg18: memref<128xf32, "cuda">, %arg19: memref<128xf32, "cuda">, %arg20: memref<256xf32, "cuda">, %arg21: memref<256xf32, "cuda">, %arg22: memref<256xf32, "cuda">, %arg23: memref<256xf32, "cuda">, %arg24: memref<256xf32, "cuda">, %arg25: memref<256xf32, "cuda">, %arg26: memref<256xf32, "cuda">, %arg27: memref<256xf32, "cuda">, %arg28: memref<256xf32, "cuda">, %arg29: memref<256xf32, "cuda">, %arg30: memref<512xf32, "cuda">, %arg31: memref<512xf32, "cuda">, %arg32: memref<512xf32, "cuda">, %arg33: memref<512xf32, "cuda">, %arg34: memref<512xf32, "cuda">, %arg35: memref<512xf32, "cuda">, %arg36: memref<512xf32, "cuda">, %arg37: memref<512xf32, "cuda">, %arg38: memref<512xf32, "cuda">, %arg39: memref<512xf32, "cuda">, %arg40: memref<64xf32, "cuda">, %arg41: memref<64xf32, "cuda">, %arg42: memref<64xf32, "cuda">, %arg43: memref<64xf32, "cuda">, %arg44: memref<64xf32, "cuda">, %arg45: memref<64xf32, "cuda">, %arg46: memref<64xf32, "cuda">, %arg47: memref<64xf32, "cuda">, %arg48: memref<64xf32, "cuda">, %arg49: memref<64xf32, "cuda">, %arg50: memref<128xf32, "cuda">, %arg51: memref<128xf32, "cuda">, %arg52: memref<128xf32, "cuda">, %arg53: memref<128xf32, "cuda">, %arg54: memref<128xf32, "cuda">, %arg55: memref<128xf32, "cuda">, %arg56: memref<128xf32, "cuda">, %arg57: memref<128xf32, "cuda">, %arg58: memref<128xf32, "cuda">, %arg59: memref<128xf32, "cuda">, %arg60: memref<256xf32, "cuda">, %arg61: memref<256xf32, "cuda">, %arg62: memref<256xf32, "cuda">, %arg63: memref<256xf32, "cuda">, %arg64: memref<256xf32, "cuda">, %arg65: memref<256xf32, "cuda">, %arg66: memref<256xf32, "cuda">, %arg67: memref<256xf32, "cuda">, %arg68: memref<256xf32, "cuda">, %arg69: memref<256xf32, "cuda">, %arg70: memref<512xf32, "cuda">, %arg71: memref<512xf32, "cuda">, %arg72: memref<512xf32, "cuda">, %arg73: memref<512xf32, "cuda">, %arg74: memref<512xf32, "cuda">, %arg75: memref<512xf32, "cuda">, %arg76: memref<512xf32, "cuda">, %arg77: memref<512xf32, "cuda">, %arg78: memref<512xf32, "cuda">, %arg79: memref<512xf32, "cuda">, %arg80: memref<64x3x7x7xf16, "cuda">, %arg81: memref<1x3x224x224xf16, "cuda">, %arg82: memref<1x64x112x112xf16, "cuda">, %arg83: memref<1x64x112x112xf16, "cuda">, %arg84: memref<1x64x56x56xf16, "cuda">, %arg85: memref<64x64x3x3xf16, "cuda">, %arg86: memref<1x64x56x56xf16, "cuda">, %arg87: memref<1x64x56x56xf16, "cuda">, %arg88: memref<64x64x3x3xf16, "cuda">, %arg89: memref<1x64x56x56xf16, "cuda">, %arg90: memref<1x64x56x56xf16, "cuda">, %arg91: memref<64x64x3x3xf16, "cuda">, %arg92: memref<1x64x56x56xf16, "cuda">, %arg93: memref<1x64x56x56xf16, "cuda">, %arg94: memref<64x64x3x3xf16, "cuda">, %arg95: memref<1x64x56x56xf16, "cuda">, %arg96: memref<1x64x56x56xf16, "cuda">, %arg97: memref<128x64x3x3xf16, "cuda">, %arg98: memref<1x128x28x28xf16, "cuda">, %arg99: memref<1x128x28x28xf16, "cuda">, %arg100: memref<128x128x3x3xf16, "cuda">, %arg101: 
memref<1x128x28x28xf16, "cuda">, %arg102: memref<128x64x1x1xf16, "cuda">, %arg103: memref<1x128x28x28xf16, "cuda">, %arg104: memref<1x128x28x28xf16, "cuda">, %arg105: memref<128x128x3x3xf16, "cuda">, %arg106: memref<1x128x28x28xf16, "cuda">, %arg107: memref<1x128x28x28xf16, "cuda">, %arg108: memref<128x128x3x3xf16, "cuda">, %arg109: memref<1x128x28x28xf16, "cuda">, %arg110: memref<1x128x28x28xf16, "cuda">, %arg111: memref<256x128x3x3xf16, "cuda">, %arg112: memref<1x256x14x14xf16, "cuda">, %arg113: memref<1x256x14x14xf16, "cuda">, %arg114: memref<256x256x3x3xf16, "cuda">, %arg115: memref<1x256x14x14xf16, "cuda">, %arg116: memref<256x128x1x1xf16, "cuda">, %arg117: memref<1x256x14x14xf16, "cuda">, %arg118: memref<1x256x14x14xf16, "cuda">, %arg119: memref<256x256x3x3xf16, "cuda">, %arg120: memref<1x256x14x14xf16, "cuda">, %arg121: memref<1x256x14x14xf16, "cuda">, %arg122: memref<256x256x3x3xf16, "cuda">, %arg123: memref<1x256x14x14xf16, "cuda">, %arg124: memref<1x256x14x14xf16, "cuda">, %arg125: memref<512x256x3x3xf16, "cuda">, %arg126: memref<1x512x7x7xf16, "cuda">, %arg127: memref<1x512x7x7xf16, "cuda">, %arg128: memref<512x512x3x3xf16, "cuda">, %arg129: memref<1x512x7x7xf16, "cuda">, %arg130: memref<512x256x1x1xf16, "cuda">, %arg131: memref<1x512x7x7xf16, "cuda">, %arg132: memref<1x512x7x7xf16, "cuda">, %arg133: memref<512x512x3x3xf16, "cuda">, %arg134: memref<1x512x7x7xf16, "cuda">, %arg135: memref<1x512x7x7xf16, "cuda">, %arg136: memref<512x512x3x3xf16, "cuda">, %arg137: memref<1x512x7x7xf16, "cuda">, %arg138: memref<1x512x7x7xf16, "cuda">, %arg139: memref<1x512xf16, "cuda">, %arg140: memref<512x1000xf16, "cuda">, %arg141: memref<1x1000xf16, "cuda">) -> (memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x3x7x7xf32, "cuda">, memref<1000xf32, "cuda">, memref<1000x512xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x64x3x3xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128x64x1x1xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x128x3x3xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256x128x1x1xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x256x3x3xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512x256x1x1xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf32, "cuda">) attributes {__placeholder__byre.entry_point} { %alloc = memref.alloc() : memref<1x512xf16, "cuda"> byre.compute @MatmulOp_f16f16_f16(%arg141, %arg140, %alloc) {device = "cuda", 
lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x1000xf16, "cuda">, memref<512x1000xf16, "cuda">, memref<1x512xf16, "cuda"> @@ -1805,7 +632,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_10, %arg128, %alloc_13) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> %alloc_14 = memref.alloc() : memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg127, %alloc_10, %alloc_14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - %3 = call @Unknown12(%arg127, %alloc_13) : (memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %3 = call @Unknown4(%arg127, %alloc_13) : (memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> %alloc_15 = memref.alloc() : memref<1x512x7x7xf16, "cuda"> %alloc_16 = memref.alloc() : memref<512xf32, "cuda"> %alloc_17 = memref.alloc() : memref<512xf32, "cuda"> @@ -1840,7 +667,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_30, %arg119, %alloc_33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> %alloc_34 = memref.alloc() : memref<256x256x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %alloc_30, %alloc_34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - %6 = call @Unknown27(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %6 = call @Unknown19(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> %alloc_35 = memref.alloc() : memref<1x256x14x14xf16, "cuda"> %alloc_36 = memref.alloc() : memref<256xf32, "cuda"> %alloc_37 = memref.alloc() : memref<256xf32, "cuda"> @@ -1849,7 +676,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_35, %arg114, %alloc_38) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = 
"NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> %alloc_39 = memref.alloc() : memref<256x256x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %alloc_35, %alloc_39) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - %7 = call @Unknown31(%arg113, %alloc_38) : (memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %7 = call @Unknown23(%arg113, %alloc_38) : (memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16, "cuda"> %alloc_41 = memref.alloc() : memref<256xf32, "cuda"> %alloc_42 = memref.alloc() : memref<256xf32, "cuda"> @@ -1884,7 +711,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_55, %arg105, %alloc_58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> %alloc_59 = memref.alloc() : memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg104, %alloc_55, %alloc_59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %10 = call @Unknown46(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %10 = call @Unknown38(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> %alloc_60 = memref.alloc() : memref<1x128x28x28xf16, "cuda"> %alloc_61 = memref.alloc() : memref<128xf32, "cuda"> %alloc_62 = memref.alloc() : memref<128xf32, "cuda"> @@ -1893,7 +720,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_60, %arg100, %alloc_63) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> %alloc_64 = memref.alloc() : memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg99, %alloc_60, %alloc_64) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %11 = call @Unknown50(%arg99, %alloc_63) : (memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %11 = call @Unknown42(%arg99, %alloc_63) : (memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> %alloc_65 = memref.alloc() : memref<1x128x28x28xf16, "cuda"> %alloc_66 = memref.alloc() : memref<128xf32, "cuda"> %alloc_67 = memref.alloc() : memref<128xf32, "cuda"> @@ -1928,7 +755,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_80, %arg91, %alloc_83) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> %alloc_84 = memref.alloc() : memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg90, %alloc_80, %alloc_84) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - %14 = call @Unknown65(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %14 = call @Unknown57(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> %alloc_85 = memref.alloc() : memref<1x64x56x56xf16, "cuda"> %alloc_86 = memref.alloc() : memref<64xf32, "cuda"> %alloc_87 = memref.alloc() : memref<64xf32, "cuda"> @@ -1937,7 +764,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_85, %arg88, %alloc_88) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> %alloc_89 = memref.alloc() : memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg87, %alloc_85, %alloc_89) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - %15 = call @Unknown69(%arg87, %alloc_88) : (memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %15 = call @Unknown61(%arg87, %alloc_88) : (memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> %alloc_90 = memref.alloc() : 
memref<1x64x56x56xf16, "cuda"> %alloc_91 = memref.alloc() : memref<64xf32, "cuda"> %alloc_92 = memref.alloc() : memref<64xf32, "cuda"> @@ -1957,34 +784,31 @@ module attributes {gpu.container_module} { %alloc_99 = memref.alloc() : memref<64x3x7x7xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg81, %alloc_96, %alloc_99) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> %18 = call @Unknown77(%alloc_99) : (memref<64x3x7x7xf16, "cuda">) -> memref<64x3x7x7xf32, "cuda"> - %19 = call @Unknown78(%arg141) : (memref<1x1000xf16, "cuda">) -> memref<1x1000xf32, "cuda"> - %alloc_100 = memref.alloc() : memref<1000xf32, "cuda"> - byre.compute @ReduceSumOp_f32_f32(%19, %alloc_100) {device = "cuda", dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf32, "cuda">, memref<1000xf32, "cuda"> - %20 = call @Unknown79(%alloc_100) : (memref<1000xf32, "cuda">) -> memref<1000xf32, "cuda"> + %19 = call @Unknown78(%arg141) : (memref<1x1000xf16, "cuda">) -> memref<1000xf32, "cuda"> %collapse_shape = memref.collapse_shape %arg141 [[0, 1]] : memref<1x1000xf16, "cuda"> into memref<1000xf16, "cuda"> %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<1000xf16, "cuda"> into memref<1000x1xf16, "cuda"> - %alloc_101 = memref.alloc() : memref<1000x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_101) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16, "cuda">, memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda"> - %21 = call @Unknown80(%alloc_101) : (memref<1000x512xf16, "cuda">) -> memref<1000x512xf32, "cuda"> - %22 = call @Unknown81(%alloc_94) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> - %23 = call @Unknown82(%alloc_89) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> - %24 = call @Unknown83(%alloc_84) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> - %25 = call @Unknown84(%alloc_79) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> - %26 = call @Unknown85(%alloc_69) : (memref<128x64x3x3xf16, "cuda">) -> memref<128x64x3x3xf32, "cuda"> - %27 = call @Unknown86(%alloc_64) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> - %28 = call @Unknown87(%alloc_74) : (memref<128x64x1x1xf16, "cuda">) -> memref<128x64x1x1xf32, "cuda"> - %29 = call @Unknown88(%alloc_59) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> - %30 = call @Unknown89(%alloc_54) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> - %31 = call @Unknown90(%alloc_44) : (memref<256x128x3x3xf16, "cuda">) -> memref<256x128x3x3xf32, "cuda"> - %32 = call @Unknown91(%alloc_39) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> - %33 = call @Unknown92(%alloc_49) : (memref<256x128x1x1xf16, "cuda">) -> memref<256x128x1x1xf32, "cuda"> - %34 = call @Unknown93(%alloc_34) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> - %35 = call @Unknown94(%alloc_29) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> - %36 = call @Unknown95(%alloc_19) : 
(memref<512x256x3x3xf16, "cuda">) -> memref<512x256x3x3xf32, "cuda"> - %37 = call @Unknown96(%alloc_14) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> - %38 = call @Unknown97(%alloc_24) : (memref<512x256x1x1xf16, "cuda">) -> memref<512x256x1x1xf32, "cuda"> - %39 = call @Unknown98(%alloc_9) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> - %40 = call @Unknown99(%alloc_4) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> - return %alloc_98, %alloc_97, %18, %20, %21, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %22, %23, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %24, %25, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %26, %27, %28, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %29, %30, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %31, %32, %33, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %34, %35, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %36, %37, %38, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %39, %40 : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x3x7x7xf32, "cuda">, memref<1000xf32, "cuda">, memref<1000x512xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x64x3x3xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128x64x1x1xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x128x3x3xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256x128x1x1xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x256x3x3xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512x256x1x1xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf32, "cuda"> + %alloc_100 = memref.alloc() : memref<1000x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_100) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16, "cuda">, memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda"> + %20 = call @Unknown79(%alloc_100) : (memref<1000x512xf16, "cuda">) -> memref<1000x512xf32, "cuda"> + %21 = call @Unknown80(%alloc_94) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> + %22 = call @Unknown80(%alloc_89) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> + %23 = call @Unknown80(%alloc_84) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> + %24 = call @Unknown80(%alloc_79) : 
(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> + %25 = call @Unknown84(%alloc_69) : (memref<128x64x3x3xf16, "cuda">) -> memref<128x64x3x3xf32, "cuda"> + %26 = call @Unknown85(%alloc_64) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> + %27 = call @Unknown86(%alloc_74) : (memref<128x64x1x1xf16, "cuda">) -> memref<128x64x1x1xf32, "cuda"> + %28 = call @Unknown85(%alloc_59) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> + %29 = call @Unknown85(%alloc_54) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> + %30 = call @Unknown89(%alloc_44) : (memref<256x128x3x3xf16, "cuda">) -> memref<256x128x3x3xf32, "cuda"> + %31 = call @Unknown90(%alloc_39) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> + %32 = call @Unknown91(%alloc_49) : (memref<256x128x1x1xf16, "cuda">) -> memref<256x128x1x1xf32, "cuda"> + %33 = call @Unknown90(%alloc_34) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> + %34 = call @Unknown90(%alloc_29) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> + %35 = call @Unknown94(%alloc_19) : (memref<512x256x3x3xf16, "cuda">) -> memref<512x256x3x3xf32, "cuda"> + %36 = call @Unknown95(%alloc_14) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> + %37 = call @Unknown96(%alloc_24) : (memref<512x256x1x1xf16, "cuda">) -> memref<512x256x1x1xf32, "cuda"> + %38 = call @Unknown95(%alloc_9) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> + %39 = call @Unknown95(%alloc_4) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> + return %alloc_98, %alloc_97, %18, %19, %20, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %21, %22, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %23, %24, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %25, %26, %27, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %28, %29, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %30, %31, %32, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %33, %34, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %35, %36, %37, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %38, %39 : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x3x7x7xf32, "cuda">, memref<1000xf32, "cuda">, memref<1000x512xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x64x3x3xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128x64x1x1xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x128x3x3xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256x128x1x1xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, 
"cuda">, memref<512xf32, "cuda">, memref<512x256x3x3xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512x256x1x1xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf32, "cuda"> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/9a_byre_host.mlir b/compiler/test/E2E/ResNet18/BW/9a_byre_host.mlir index 775a491e2..80ac58119 100644 --- a/compiler/test/E2E/ResNet18/BW/9a_byre_host.mlir +++ b/compiler/test/E2E/ResNet18/BW/9a_byre_host.mlir @@ -4,915 +4,250 @@ module attributes {byre.container_module, gpu.container_module} { gpu.module @unified { - gpu.func @Unknown99(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown98(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : 
index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown97(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown96(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { %c131072 = arith.constant 131072 : index + %c0 = arith.constant 0 : index %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> } gpu.return } - gpu.func @Unknown96(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown95(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 
= arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown95(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf32> } gpu.return } - gpu.func @Unknown94(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown94(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = 
gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown93(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = 
arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf32> } gpu.return } - gpu.func @Unknown92(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown91(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { %c32768 = arith.constant 32768 : index + %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> } gpu.return } - gpu.func @Unknown91(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown90(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, 
%9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown90(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf32> } gpu.return } - gpu.func @Unknown89(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown89(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { + %c294912 = arith.constant 294912 : index %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 
= arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> - } - gpu.return - } - gpu.func @Unknown88(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf32> } gpu.return } - gpu.func @Unknown87(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown86(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : 
index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> } gpu.return } - gpu.func @Unknown86(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown85(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] 
: memref<128x128x3x3xf32> } gpu.return } - gpu.func @Unknown85(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown84(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown84(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = 
arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf32> } gpu.return } - gpu.func @Unknown83(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown80(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown82(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = 
arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf32> } gpu.return } - gpu.func @Unknown81(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown80(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown79(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index - %0 = 
gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf32> - } - gpu.return - } - gpu.func @Unknown79(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { - %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<1000xf32> - %7 = arith.truncf %6 : f32 to f16 - %8 = arith.extf %7 : f16 to f32 - memref.store %8, %arg1[%4] : memref<1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf32> } gpu.return } @@ -924,973 +259,478 @@ module attributes {byre.container_module, gpu.container_module} { %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%c0, %4] : memref<1x1000xf16> - %7 = arith.extf %6 : f16 to f32 - memref.store %7, %arg1[%c0, %4] : memref<1x1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%c0, %arg2] : memref<1x1000xf16> + %8 = arith.extf %7 : f16 to f32 + %9 = arith.truncf %8 : f32 to f16 + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%c0, %arg2] : memref<1x1000xf32> } gpu.return } gpu.func @Unknown77(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { - %c0 = arith.constant 0 : index %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = 
arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf32> } gpu.return } gpu.func @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>, %arg2: memref<1x64x112x112xf16>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x112x112xf16> } gpu.return } gpu.func @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = 
arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - memref.store %28, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown69(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown65(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>, %arg3: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - 
%13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.addf %11, %12 : f16 + memref.store %13, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : 
memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>, %arg3: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown50(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown46(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>, %arg3: memref<1x128x28x28xf16>) kernel { - %cst = 
arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - 
%26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>, %arg3: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown31(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index 
- %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown27(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>, %arg3: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index 
- scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>, %arg3: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = 
arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>, %arg3: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = 
memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index %c25088 = arith.constant 25088 : index + %cst = arith.constant 4.900000e+01 : f16 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = 
arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg0[%c0, %25] : memref<1x512xf16> - %28 = arith.divf %27, %cst_0 : f16 - %29 = arith.cmpf ogt, %26, %cst : f16 - %30 = arith.select %29, %28, %cst : f16 - memref.store %30, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10] : memref<1x512xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.divf %11, %cst : f16 + %14 = arith.cmpf ogt, %12, %cst_0 : f16 + %15 = arith.select %14, %13, %cst_0 : f16 + memref.store %15, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } } func.func @main(%arg0: memref<64xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<64xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<64xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<64xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<128xf32, "cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<128xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<128xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<128xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<128xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<128xf32, "cuda"> {byre.argname = "Input15", byre.argtype = 1 : i32}, %arg16: memref<128xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<128xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<128xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<128xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<256xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<256xf32, 
"cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<256xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<256xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<256xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<256xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<256xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<256xf32, "cuda"> {byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<256xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<256xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<512xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, %arg31: memref<512xf32, "cuda"> {byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<512xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<512xf32, "cuda"> {byre.argname = "Input33", byre.argtype = 1 : i32}, %arg34: memref<512xf32, "cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<512xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<512xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<512xf32, "cuda"> {byre.argname = "Input37", byre.argtype = 1 : i32}, %arg38: memref<512xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<512xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<64xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<64xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<64xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<64xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<64xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<64xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<64xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<64xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<64xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<64xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<128xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<128xf32, "cuda"> {byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<128xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<128xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<128xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<128xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<128xf32, "cuda"> {byre.argname = "Input56", byre.argtype = 1 : i32}, %arg57: memref<128xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<128xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<128xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<256xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<256xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref<256xf32, "cuda"> {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<256xf32, "cuda"> {byre.argname = "Input63", 
byre.argtype = 1 : i32}, %arg64: memref<256xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref<256xf32, "cuda"> {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<256xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<256xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref<256xf32, "cuda"> {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<256xf32, "cuda"> {byre.argname = "Input69", byre.argtype = 1 : i32}, %arg70: memref<512xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref<512xf32, "cuda"> {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<512xf32, "cuda"> {byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<512xf32, "cuda"> {byre.argname = "Input73", byre.argtype = 1 : i32}, %arg74: memref<512xf32, "cuda"> {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<512xf32, "cuda"> {byre.argname = "Input75", byre.argtype = 1 : i32}, %arg76: memref<512xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref<512xf32, "cuda"> {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<512xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: memref<512xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref<64x3x7x7xf16, "cuda"> {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<1x3x224x224xf16, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input95", byre.argtype = 1 : i32}, %arg96: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<128x64x3x3xf16, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input101", byre.argtype = 1 : i32}, %arg102: memref<128x64x1x1xf16, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input103", byre.argtype = 
1 : i32}, %arg104: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input104", byre.argtype = 1 : i32}, %arg105: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input105", byre.argtype = 1 : i32}, %arg106: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input106", byre.argtype = 1 : i32}, %arg107: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input107", byre.argtype = 1 : i32}, %arg108: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input108", byre.argtype = 1 : i32}, %arg109: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input109", byre.argtype = 1 : i32}, %arg110: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input110", byre.argtype = 1 : i32}, %arg111: memref<256x128x3x3xf16, "cuda"> {byre.argname = "Input111", byre.argtype = 1 : i32}, %arg112: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input112", byre.argtype = 1 : i32}, %arg113: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input113", byre.argtype = 1 : i32}, %arg114: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input114", byre.argtype = 1 : i32}, %arg115: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input115", byre.argtype = 1 : i32}, %arg116: memref<256x128x1x1xf16, "cuda"> {byre.argname = "Input116", byre.argtype = 1 : i32}, %arg117: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input117", byre.argtype = 1 : i32}, %arg118: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input118", byre.argtype = 1 : i32}, %arg119: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input119", byre.argtype = 1 : i32}, %arg120: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input120", byre.argtype = 1 : i32}, %arg121: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input121", byre.argtype = 1 : i32}, %arg122: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input122", byre.argtype = 1 : i32}, %arg123: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input123", byre.argtype = 1 : i32}, %arg124: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input124", byre.argtype = 1 : i32}, %arg125: memref<512x256x3x3xf16, "cuda"> {byre.argname = "Input125", byre.argtype = 1 : i32}, %arg126: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input126", byre.argtype = 1 : i32}, %arg127: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input127", byre.argtype = 1 : i32}, %arg128: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input128", byre.argtype = 1 : i32}, %arg129: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input129", byre.argtype = 1 : i32}, %arg130: memref<512x256x1x1xf16, "cuda"> {byre.argname = "Input130", byre.argtype = 1 : i32}, %arg131: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input131", byre.argtype = 1 : i32}, %arg132: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input132", byre.argtype = 1 : i32}, %arg133: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input133", byre.argtype = 1 : i32}, %arg134: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input134", byre.argtype = 1 : i32}, %arg135: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input135", byre.argtype = 1 : i32}, %arg136: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input136", byre.argtype = 1 : i32}, %arg137: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input137", byre.argtype = 1 : i32}, %arg138: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input138", byre.argtype = 1 : i32}, %arg139: memref<1x512xf16, "cuda"> {byre.argname = "Input139", byre.argtype = 1 : i32}, %arg140: memref<512x1000xf16, "cuda"> {byre.argname = "Input140", byre.argtype = 1 : i32}, %arg141: memref<1x1000xf16, "cuda"> {byre.argname = "Input141", byre.argtype = 1 : i32}, 
%arg142: memref<64xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg143: memref<64xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}, %arg144: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Output2", byre.argtype = 2 : i32}, %arg145: memref<1000xf32, "cuda"> {byre.argname = "Output3", byre.argtype = 2 : i32}, %arg146: memref<1000x512xf32, "cuda"> {byre.argname = "Output4", byre.argtype = 2 : i32}, %arg147: memref<64xf32, "cuda"> {byre.argname = "Output5", byre.argtype = 2 : i32}, %arg148: memref<64xf32, "cuda"> {byre.argname = "Output6", byre.argtype = 2 : i32}, %arg149: memref<64xf32, "cuda"> {byre.argname = "Output7", byre.argtype = 2 : i32}, %arg150: memref<64xf32, "cuda"> {byre.argname = "Output8", byre.argtype = 2 : i32}, %arg151: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output9", byre.argtype = 2 : i32}, %arg152: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output10", byre.argtype = 2 : i32}, %arg153: memref<64xf32, "cuda"> {byre.argname = "Output11", byre.argtype = 2 : i32}, %arg154: memref<64xf32, "cuda"> {byre.argname = "Output12", byre.argtype = 2 : i32}, %arg155: memref<64xf32, "cuda"> {byre.argname = "Output13", byre.argtype = 2 : i32}, %arg156: memref<64xf32, "cuda"> {byre.argname = "Output14", byre.argtype = 2 : i32}, %arg157: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output15", byre.argtype = 2 : i32}, %arg158: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output16", byre.argtype = 2 : i32}, %arg159: memref<128xf32, "cuda"> {byre.argname = "Output17", byre.argtype = 2 : i32}, %arg160: memref<128xf32, "cuda"> {byre.argname = "Output18", byre.argtype = 2 : i32}, %arg161: memref<128xf32, "cuda"> {byre.argname = "Output19", byre.argtype = 2 : i32}, %arg162: memref<128xf32, "cuda"> {byre.argname = "Output20", byre.argtype = 2 : i32}, %arg163: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Output21", byre.argtype = 2 : i32}, %arg164: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output22", byre.argtype = 2 : i32}, %arg165: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Output23", byre.argtype = 2 : i32}, %arg166: memref<128xf32, "cuda"> {byre.argname = "Output24", byre.argtype = 2 : i32}, %arg167: memref<128xf32, "cuda"> {byre.argname = "Output25", byre.argtype = 2 : i32}, %arg168: memref<128xf32, "cuda"> {byre.argname = "Output26", byre.argtype = 2 : i32}, %arg169: memref<128xf32, "cuda"> {byre.argname = "Output27", byre.argtype = 2 : i32}, %arg170: memref<128xf32, "cuda"> {byre.argname = "Output28", byre.argtype = 2 : i32}, %arg171: memref<128xf32, "cuda"> {byre.argname = "Output29", byre.argtype = 2 : i32}, %arg172: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output30", byre.argtype = 2 : i32}, %arg173: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output31", byre.argtype = 2 : i32}, %arg174: memref<256xf32, "cuda"> {byre.argname = "Output32", byre.argtype = 2 : i32}, %arg175: memref<256xf32, "cuda"> {byre.argname = "Output33", byre.argtype = 2 : i32}, %arg176: memref<256xf32, "cuda"> {byre.argname = "Output34", byre.argtype = 2 : i32}, %arg177: memref<256xf32, "cuda"> {byre.argname = "Output35", byre.argtype = 2 : i32}, %arg178: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Output36", byre.argtype = 2 : i32}, %arg179: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output37", byre.argtype = 2 : i32}, %arg180: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Output38", byre.argtype = 2 : i32}, %arg181: memref<256xf32, "cuda"> {byre.argname = "Output39", byre.argtype = 2 : i32}, %arg182: memref<256xf32, "cuda"> 
{byre.argname = "Output40", byre.argtype = 2 : i32}, %arg183: memref<256xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : i32}, %arg184: memref<256xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg185: memref<256xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg186: memref<256xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg187: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, %arg188: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg189: memref<512xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg190: memref<512xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg191: memref<512xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : i32}, %arg192: memref<512xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg193: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg194: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg195: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Output53", byre.argtype = 2 : i32}, %arg196: memref<512xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg197: memref<512xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg198: memref<512xf32, "cuda"> {byre.argname = "Output56", byre.argtype = 2 : i32}, %arg199: memref<512xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg200: memref<512xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg201: memref<512xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg202: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg203: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}) attributes {byre.entry_point} { %alloc = memref.alloc() : memref<25927680xi8, "cuda"> - %0 = "byre.alias"(%alloc) {offset = 1671168 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512xf16, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 1671168 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512xf16, "cuda"> byre.compute @MatmulOp_f16f16_f16(%arg141, %arg140, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x1000xf16, "cuda">, memref<512x1000xf16, "cuda">, memref<1x512xf16, "cuda"> - %1 = "byre.alias"(%alloc) {offset = 16490496 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%0, %arg138, %1) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %2 = "byre.alias"(%alloc) {offset = 21209088 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 16490496 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%0, %arg138, %1) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 21209088 : i64}> : (memref<25927680xi8, "cuda">) -> 
memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg137, %arg39, %1, %2, %arg201, %arg200) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %3 = "byre.alias"(%alloc) {offset = 16540672 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %3 = "byre.alias"(%alloc) <{offset = 16540672 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%2, %arg136, %3) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %4 = "byre.alias"(%alloc) {offset = 1671168 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 1671168 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg135, %2, %4) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg135, %3, %2) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg135, %3, %2) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg134, %arg37, %2, %3, %arg199, %arg198) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %5 = "byre.alias"(%alloc) {offset = 14131200 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 14131200 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%3, %arg133, %5) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %6 = "byre.alias"(%alloc) {offset = 21209088 : i64} : (memref<25927680xi8, "cuda">) -> 
memref<512x512x3x3xf16, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 21209088 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg132, %3, %6) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - %7 = "byre.alias"(%alloc) {offset = 9740288 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%1, %5, %arg132, %7) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown8", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%1, %5, %arg132, %7) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown8", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg129, %arg33, %7, %5, %arg192, %arg191) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %8 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%5, %arg128, %8) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %9 = "byre.alias"(%alloc) {offset = 16490496 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 16490496 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg127, %5, %9) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg127, %8, %5) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown12", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, 
memref<1x512x7x7xf16, "cuda"> - %10 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg127, %8, %5) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 10970112 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg126, %arg31, %5, %10, %arg190, %arg189) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %11 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%10, %arg125, %11) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %12 = "byre.alias"(%alloc) {offset = 14131200 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 14131200 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %10, %12) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg131, %arg35, %7, %10, %arg197, %arg196) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %13 = "byre.alias"(%alloc) {offset = 12625920 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%10, %arg130, %13) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %14 = "byre.alias"(%alloc) {offset = 819200 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %10, %14) {batch_group_count = 1 : i64, device = "cuda", 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> - %15 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%13, %11, %arg124, %15) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg123, %arg29, %15, %11, %arg186, %arg185) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %16 = "byre.alias"(%alloc) {offset = 11020288 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg122, %16) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %17 = "byre.alias"(%alloc) {offset = 9740288 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg121, %11, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg121, %16, %11) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg120, %arg27, %11, %16, %arg184, %arg183) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%16, %arg119, %11) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %18 = "byre.alias"(%alloc) {offset = 
7380992 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %16, %18) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - %19 = "byre.alias"(%alloc) {offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%15, %11, %arg118, %19) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown27", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg115, %arg23, %19, %11, %arg177, %arg176) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg114, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %20 = "byre.alias"(%alloc) {offset = 8560640 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %11, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg113, %15, %11) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown31", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %21 = "byre.alias"(%alloc) {offset = 6490112 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg112, %arg21, %11, %21, %arg175, %arg174) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %22 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%21, %arg111, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = 
"NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %23 = "byre.alias"(%alloc) {offset = 6791168 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %21, %23) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> - %24 = "byre.alias"(%alloc) {offset = 11120640 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg117, %arg25, %19, %24, %arg182, %arg181) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %25 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 9740288 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg131, %arg35, %7, %13, %arg197, %arg196) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 12625920 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%13, %arg130, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 819200 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %13, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%14, %11, %arg124, %16) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, 
memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg123, %arg29, %16, %11, %arg186, %arg185) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 11020288 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg122, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %18 = "byre.alias"(%alloc) <{offset = 9740288 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg121, %11, %18) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg121, %17, %11) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg120, %arg27, %11, %17, %arg184, %arg183) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 8560640 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%17, %arg119, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %20 = "byre.alias"(%alloc) <{offset = 7380992 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %17, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%16, %19, %arg118, %11) {BlockSize.x = 
256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg115, %arg23, %11, %16, %arg177, %arg176) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%16, %arg114, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 8560640 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %16, %21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg113, %14, %16) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg112, %arg21, %16, %14, %arg175, %arg174) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%14, %arg111, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 6791168 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %14, %23) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, 
memref<256x128x3x3xf16, "cuda"> + %24 = "byre.alias"(%alloc) <{offset = 11120640 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg117, %arg25, %11, %24, %arg182, %arg181) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%24, %arg116, %25) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %26 = "byre.alias"(%alloc) {offset = 311296 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 311296 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %24, %26) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> - %27 = "byre.alias"(%alloc) {offset = 1081344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%25, %22, %arg110, %27) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 1081344 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%25, %22, %arg110, %27) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg109, %arg19, %27, %25, %arg171, %arg170) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%25, %arg108, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : 
memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %28 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %28 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg107, %25, %28) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg107, %22, %25) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown42", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg107, %22, %25) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown42", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg106, %arg17, %25, %22, %arg169, %arg168) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%22, %arg105, %25) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %29 = "byre.alias"(%alloc) {offset = 1376256 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 1376256 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg104, %22, %29) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %30 = "byre.alias"(%alloc) {offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%27, %25, %arg104, %30) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 6389760 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%27, %25, 
%arg104, %30) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg101, %arg13, %30, %25, %arg162, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%25, %arg100, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %31 = "byre.alias"(%alloc) {offset = 1081344 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 1081344 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg99, %25, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg99, %22, %25) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown50", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %32 = "byre.alias"(%alloc) {offset = 6590464 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg99, %22, %25) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown42", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 6590464 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg98, %arg11, %25, %32, %arg160, %arg159) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %33 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%32, %arg97, %33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = 
"NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %34 = "byre.alias"(%alloc) {offset = 671744 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 671744 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg96, %32, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> - %35 = "byre.alias"(%alloc) {offset = 11321344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 11321344 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg103, %arg15, %30, %35, %arg167, %arg166) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %36 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%35, %arg102, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %37 = "byre.alias"(%alloc) {offset = 294912 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 294912 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg96, %35, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> - %38 = "byre.alias"(%alloc) {offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%36, %33, %arg96, %38) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 6389760 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + 
byre.compute @PTXOp(%36, %33, %arg96, %38) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg95, %arg9, %38, %36, %arg156, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%36, %arg94, %33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %39 = "byre.alias"(%alloc) {offset = 376832 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 598016 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg93, %36, %39) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg93, %33, %36) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg93, %33, %36) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg92, %arg7, %36, %33, %arg154, %arg153) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%33, %arg91, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %40 = "byre.alias"(%alloc) {offset = 524288 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 524288 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> 
byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg90, %33, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%38, %36, %arg90, %33) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown65", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%38, %36, %arg90, %33) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg89, %arg5, %33, %36, %arg150, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %41 = "byre.alias"(%alloc) {offset = 11321344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 11321344 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%36, %arg88, %41) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %42 = "byre.alias"(%alloc) {offset = 598016 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %42 = "byre.alias"(%alloc) <{offset = 376832 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg87, %36, %42) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg87, %41, %36) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg87, %41, %36) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute 
@BatchNormGradOp_f16f32f16_f16f32f32(%arg86, %arg3, %36, %41, %arg148, %arg147) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%41, %arg85, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %43 = "byre.alias"(%alloc) {offset = 450560 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 450560 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg84, %41, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%33, %36, %38) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %44 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%33, %36, %38) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> byre.compute @PoolMaxGradOp_f16f16_f16(%arg83, %38, %44) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<1x64x112x112xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> - %45 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> - byre.compute @PTXOp(%arg83, %44, %45) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%arg83, %44, %45) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, 
memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg82, %arg1, %45, %44, %arg143, %arg142) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %46 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg81, %44, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> - byre.compute @PTXOp(%46, %arg144) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown77", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> - %47 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x1000xf32, "cuda"> - byre.compute @PTXOp(%arg141, %47) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown78", memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf16, "cuda">, memref<1x1000xf32, "cuda"> - %48 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1000xf32, "cuda"> - byre.compute @ReduceSumOp_f32_f32(%47, %48) {device = "cuda", dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf32, "cuda">, memref<1000xf32, "cuda"> - byre.compute @PTXOp(%48, %arg145) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown79", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda"> - %49 = "byre.alias"(%arg141) {offset = 0 : i64} : (memref<1x1000xf16, "cuda">) -> memref<1000x1xf16, "cuda"> - %50 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%49, %arg139, %50) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16, "cuda">, memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%50, %arg146) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> - byre.compute @PTXOp(%43, %arg151) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown81", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%42, %arg152) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute 
@PTXOp(%40, %arg157) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown83", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%39, %arg158) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown84", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%34, %arg163) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%31, %arg164) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown86", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%37, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown87", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> - byre.compute @PTXOp(%29, %arg172) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown88", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%28, %arg173) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%23, %arg178) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%20, %arg179) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown91", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%26, %arg180) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> - byre.compute @PTXOp(%18, %arg187) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown93", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%17, %arg188) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown94", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%12, %arg193) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%9, %arg194) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown96", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%14, %arg195) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, 
arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown97", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> - byre.compute @PTXOp(%6, %arg202) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown98", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%4, %arg203) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown99", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%46, %arg144) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown77", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> + byre.compute @PTXOp(%arg141, %arg145) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown78", memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf16, "cuda">, memref<1000xf32, "cuda"> + %47 = "byre.alias"(%arg141) <{offset = 0 : i64}> : (memref<1x1000xf16, "cuda">) -> memref<1000x1xf16, "cuda"> + %48 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%47, %arg139, %48) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16, "cuda">, memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%48, %arg146) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown79", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> + byre.compute @PTXOp(%43, %arg151) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%42, %arg152) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%40, %arg157) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%39, %arg158) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%34, %arg163) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown84", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%31, %arg164) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%37, %arg165) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown86", memory_effects = [1 : i32, 2 : i32]} : 
memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> + byre.compute @PTXOp(%29, %arg172) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%28, %arg173) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%23, %arg178) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%21, %arg179) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%26, %arg180) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown91", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> + byre.compute @PTXOp(%20, %arg187) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%18, %arg188) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%12, %arg193) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown94", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%9, %arg194) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%15, %arg195) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown96", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> + byre.compute @PTXOp(%6, %arg202) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%4, %arg203) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> return } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/9b_nvvm_codegen.mlir b/compiler/test/E2E/ResNet18/BW/9b_nvvm_codegen.mlir index e85948d3f..c4a8ef1c1 100644 --- a/compiler/test/E2E/ResNet18/BW/9b_nvvm_codegen.mlir +++ b/compiler/test/E2E/ResNet18/BW/9b_nvvm_codegen.mlir @@ -4,915 +4,250 @@ module attributes {byre.container_module, gpu.container_module} { gpu.module @unified { - gpu.func @Unknown99(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) 
kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown98(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, 
%19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown97(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown96(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { %c131072 = arith.constant 131072 : index + %c0 = arith.constant 0 : index %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> } gpu.return } - gpu.func @Unknown96(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown95(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown95(%arg0: memref<512x256x3x3xf16>, %arg1: 
memref<512x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf32> } gpu.return } - gpu.func @Unknown94(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown94(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi 
%22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown93(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf32> } gpu.return } - gpu.func @Unknown92(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown91(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { %c32768 = arith.constant 32768 : index + %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : 
index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> } gpu.return } - gpu.func @Unknown91(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown90(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown90(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi 
%c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf32> } gpu.return } - gpu.func @Unknown89(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown89(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { + %c294912 = arith.constant 294912 : index %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - 
memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> - } - gpu.return - } - gpu.func @Unknown88(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf32> } gpu.return } - gpu.func @Unknown87(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown86(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli 
%1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> } gpu.return } - gpu.func @Unknown86(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown85(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf32> } gpu.return } - gpu.func @Unknown85(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown84(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = 
arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown84(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf32> } gpu.return } - gpu.func @Unknown83(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = 
arith.constant 0 : index + gpu.func @Unknown80(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown82(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - 
memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf32> } gpu.return } - gpu.func @Unknown81(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown80(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown79(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf32> - } - gpu.return - } - gpu.func @Unknown79(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) 
kernel { - %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<1000xf32> - %7 = arith.truncf %6 : f32 to f16 - %8 = arith.extf %7 : f16 to f32 - memref.store %8, %arg1[%4] : memref<1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf32> } gpu.return } @@ -924,973 +259,478 @@ module attributes {byre.container_module, gpu.container_module} { %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%c0, %4] : memref<1x1000xf16> - %7 = arith.extf %6 : f16 to f32 - memref.store %7, %arg1[%c0, %4] : memref<1x1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%c0, %arg2] : memref<1x1000xf16> + %8 = arith.extf %7 : f16 to f32 + %9 = arith.truncf %8 : f32 to f16 + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%c0, %arg2] : memref<1x1000xf32> } gpu.return } gpu.func @Unknown77(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { - %c0 = arith.constant 0 : index %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi 
%10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf32> } gpu.return } gpu.func @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>, %arg2: memref<1x64x112x112xf16>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x112x112xf16> } gpu.return } gpu.func @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - 
%24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - memref.store %28, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown69(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown65(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>, %arg3: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select 
%30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.addf %11, %12 : f16 + memref.store %13, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>, %arg3: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - 
%13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown50(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown46(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>, %arg3: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 
: index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } 
gpu.func @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>, %arg3: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown31(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown27(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>, %arg3: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = 
arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : 
memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>, %arg3: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index 
- %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>, %arg3: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] 
: memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index %c25088 = arith.constant 25088 : index + %cst = arith.constant 4.900000e+01 : f16 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = 
arith.select %20, %24, %23 : index - %26 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg0[%c0, %25] : memref<1x512xf16> - %28 = arith.divf %27, %cst_0 : f16 - %29 = arith.cmpf ogt, %26, %cst : f16 - %30 = arith.select %29, %28, %cst : f16 - memref.store %30, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10] : memref<1x512xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.divf %11, %cst : f16 + %14 = arith.cmpf ogt, %12, %cst_0 : f16 + %15 = arith.select %14, %13, %cst_0 : f16 + memref.store %15, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } } func.func @main(%arg0: memref<64xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<64xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<64xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<64xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<128xf32, "cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<128xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<128xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<128xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<128xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<128xf32, "cuda"> {byre.argname = "Input15", byre.argtype = 1 : i32}, %arg16: memref<128xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<128xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<128xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<128xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<256xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<256xf32, "cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<256xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<256xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<256xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<256xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<256xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<256xf32, "cuda"> {byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<256xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<256xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<512xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, %arg31: memref<512xf32, "cuda"> 
{byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<512xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<512xf32, "cuda"> {byre.argname = "Input33", byre.argtype = 1 : i32}, %arg34: memref<512xf32, "cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<512xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<512xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<512xf32, "cuda"> {byre.argname = "Input37", byre.argtype = 1 : i32}, %arg38: memref<512xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<512xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<64xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<64xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<64xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<64xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<64xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<64xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<64xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<64xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<64xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<64xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<128xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<128xf32, "cuda"> {byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<128xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<128xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<128xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<128xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<128xf32, "cuda"> {byre.argname = "Input56", byre.argtype = 1 : i32}, %arg57: memref<128xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<128xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<128xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<256xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<256xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref<256xf32, "cuda"> {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<256xf32, "cuda"> {byre.argname = "Input63", byre.argtype = 1 : i32}, %arg64: memref<256xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref<256xf32, "cuda"> {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<256xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<256xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref<256xf32, "cuda"> {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<256xf32, "cuda"> {byre.argname = "Input69", byre.argtype = 1 : i32}, %arg70: memref<512xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref<512xf32, "cuda"> {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<512xf32, "cuda"> {byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<512xf32, "cuda"> {byre.argname = "Input73", 
byre.argtype = 1 : i32}, %arg74: memref<512xf32, "cuda"> {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<512xf32, "cuda"> {byre.argname = "Input75", byre.argtype = 1 : i32}, %arg76: memref<512xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref<512xf32, "cuda"> {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<512xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: memref<512xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref<64x3x7x7xf16, "cuda"> {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<1x3x224x224xf16, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input95", byre.argtype = 1 : i32}, %arg96: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<128x64x3x3xf16, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input101", byre.argtype = 1 : i32}, %arg102: memref<128x64x1x1xf16, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input103", byre.argtype = 1 : i32}, %arg104: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input104", byre.argtype = 1 : i32}, %arg105: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input105", byre.argtype = 1 : i32}, %arg106: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input106", byre.argtype = 1 : i32}, %arg107: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input107", byre.argtype = 1 : i32}, %arg108: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input108", byre.argtype = 1 : i32}, %arg109: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input109", byre.argtype = 1 : i32}, %arg110: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input110", byre.argtype = 1 : i32}, %arg111: memref<256x128x3x3xf16, "cuda"> {byre.argname = "Input111", byre.argtype = 1 : i32}, %arg112: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input112", 
byre.argtype = 1 : i32}, %arg113: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input113", byre.argtype = 1 : i32}, %arg114: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input114", byre.argtype = 1 : i32}, %arg115: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input115", byre.argtype = 1 : i32}, %arg116: memref<256x128x1x1xf16, "cuda"> {byre.argname = "Input116", byre.argtype = 1 : i32}, %arg117: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input117", byre.argtype = 1 : i32}, %arg118: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input118", byre.argtype = 1 : i32}, %arg119: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input119", byre.argtype = 1 : i32}, %arg120: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input120", byre.argtype = 1 : i32}, %arg121: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input121", byre.argtype = 1 : i32}, %arg122: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input122", byre.argtype = 1 : i32}, %arg123: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input123", byre.argtype = 1 : i32}, %arg124: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input124", byre.argtype = 1 : i32}, %arg125: memref<512x256x3x3xf16, "cuda"> {byre.argname = "Input125", byre.argtype = 1 : i32}, %arg126: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input126", byre.argtype = 1 : i32}, %arg127: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input127", byre.argtype = 1 : i32}, %arg128: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input128", byre.argtype = 1 : i32}, %arg129: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input129", byre.argtype = 1 : i32}, %arg130: memref<512x256x1x1xf16, "cuda"> {byre.argname = "Input130", byre.argtype = 1 : i32}, %arg131: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input131", byre.argtype = 1 : i32}, %arg132: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input132", byre.argtype = 1 : i32}, %arg133: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input133", byre.argtype = 1 : i32}, %arg134: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input134", byre.argtype = 1 : i32}, %arg135: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input135", byre.argtype = 1 : i32}, %arg136: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input136", byre.argtype = 1 : i32}, %arg137: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input137", byre.argtype = 1 : i32}, %arg138: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input138", byre.argtype = 1 : i32}, %arg139: memref<1x512xf16, "cuda"> {byre.argname = "Input139", byre.argtype = 1 : i32}, %arg140: memref<512x1000xf16, "cuda"> {byre.argname = "Input140", byre.argtype = 1 : i32}, %arg141: memref<1x1000xf16, "cuda"> {byre.argname = "Input141", byre.argtype = 1 : i32}, %arg142: memref<64xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg143: memref<64xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}, %arg144: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Output2", byre.argtype = 2 : i32}, %arg145: memref<1000xf32, "cuda"> {byre.argname = "Output3", byre.argtype = 2 : i32}, %arg146: memref<1000x512xf32, "cuda"> {byre.argname = "Output4", byre.argtype = 2 : i32}, %arg147: memref<64xf32, "cuda"> {byre.argname = "Output5", byre.argtype = 2 : i32}, %arg148: memref<64xf32, "cuda"> {byre.argname = "Output6", byre.argtype = 2 : i32}, %arg149: memref<64xf32, "cuda"> {byre.argname = "Output7", byre.argtype = 2 : i32}, %arg150: memref<64xf32, "cuda"> {byre.argname = "Output8", byre.argtype = 2 : i32}, %arg151: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output9", 
byre.argtype = 2 : i32}, %arg152: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output10", byre.argtype = 2 : i32}, %arg153: memref<64xf32, "cuda"> {byre.argname = "Output11", byre.argtype = 2 : i32}, %arg154: memref<64xf32, "cuda"> {byre.argname = "Output12", byre.argtype = 2 : i32}, %arg155: memref<64xf32, "cuda"> {byre.argname = "Output13", byre.argtype = 2 : i32}, %arg156: memref<64xf32, "cuda"> {byre.argname = "Output14", byre.argtype = 2 : i32}, %arg157: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output15", byre.argtype = 2 : i32}, %arg158: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output16", byre.argtype = 2 : i32}, %arg159: memref<128xf32, "cuda"> {byre.argname = "Output17", byre.argtype = 2 : i32}, %arg160: memref<128xf32, "cuda"> {byre.argname = "Output18", byre.argtype = 2 : i32}, %arg161: memref<128xf32, "cuda"> {byre.argname = "Output19", byre.argtype = 2 : i32}, %arg162: memref<128xf32, "cuda"> {byre.argname = "Output20", byre.argtype = 2 : i32}, %arg163: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Output21", byre.argtype = 2 : i32}, %arg164: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output22", byre.argtype = 2 : i32}, %arg165: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Output23", byre.argtype = 2 : i32}, %arg166: memref<128xf32, "cuda"> {byre.argname = "Output24", byre.argtype = 2 : i32}, %arg167: memref<128xf32, "cuda"> {byre.argname = "Output25", byre.argtype = 2 : i32}, %arg168: memref<128xf32, "cuda"> {byre.argname = "Output26", byre.argtype = 2 : i32}, %arg169: memref<128xf32, "cuda"> {byre.argname = "Output27", byre.argtype = 2 : i32}, %arg170: memref<128xf32, "cuda"> {byre.argname = "Output28", byre.argtype = 2 : i32}, %arg171: memref<128xf32, "cuda"> {byre.argname = "Output29", byre.argtype = 2 : i32}, %arg172: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output30", byre.argtype = 2 : i32}, %arg173: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output31", byre.argtype = 2 : i32}, %arg174: memref<256xf32, "cuda"> {byre.argname = "Output32", byre.argtype = 2 : i32}, %arg175: memref<256xf32, "cuda"> {byre.argname = "Output33", byre.argtype = 2 : i32}, %arg176: memref<256xf32, "cuda"> {byre.argname = "Output34", byre.argtype = 2 : i32}, %arg177: memref<256xf32, "cuda"> {byre.argname = "Output35", byre.argtype = 2 : i32}, %arg178: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Output36", byre.argtype = 2 : i32}, %arg179: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output37", byre.argtype = 2 : i32}, %arg180: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Output38", byre.argtype = 2 : i32}, %arg181: memref<256xf32, "cuda"> {byre.argname = "Output39", byre.argtype = 2 : i32}, %arg182: memref<256xf32, "cuda"> {byre.argname = "Output40", byre.argtype = 2 : i32}, %arg183: memref<256xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : i32}, %arg184: memref<256xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg185: memref<256xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg186: memref<256xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg187: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, %arg188: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg189: memref<512xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg190: memref<512xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg191: memref<512xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : 
i32}, %arg192: memref<512xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg193: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg194: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg195: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Output53", byre.argtype = 2 : i32}, %arg196: memref<512xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg197: memref<512xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg198: memref<512xf32, "cuda"> {byre.argname = "Output56", byre.argtype = 2 : i32}, %arg199: memref<512xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg200: memref<512xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg201: memref<512xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg202: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg203: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}) attributes {byre.entry_point} { %alloc = memref.alloc() : memref<25927680xi8, "cuda"> - %0 = "byre.alias"(%alloc) {offset = 1671168 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512xf16, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 1671168 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512xf16, "cuda"> byre.compute @MatmulOp_f16f16_f16(%arg141, %arg140, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x1000xf16, "cuda">, memref<512x1000xf16, "cuda">, memref<1x512xf16, "cuda"> - %1 = "byre.alias"(%alloc) {offset = 16490496 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%0, %arg138, %1) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %2 = "byre.alias"(%alloc) {offset = 21209088 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 16490496 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%0, %arg138, %1) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 21209088 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg137, %arg39, %1, %2, %arg201, %arg200) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %3 = "byre.alias"(%alloc) {offset = 16540672 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %3 = "byre.alias"(%alloc) <{offset = 16540672 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%2, %arg136, %3) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %4 = "byre.alias"(%alloc) {offset = 1671168 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 1671168 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg135, %2, %4) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg135, %3, %2) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg135, %3, %2) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg134, %arg37, %2, %3, %arg199, %arg198) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %5 = "byre.alias"(%alloc) {offset = 14131200 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 14131200 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%3, %arg133, %5) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %6 = "byre.alias"(%alloc) {offset = 21209088 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 21209088 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg132, %3, %6) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - %7 = "byre.alias"(%alloc) {offset = 9740288 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%1, %5, %arg132, %7) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], 
kernel_name = "Unknown8", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%1, %5, %arg132, %7) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown8", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg129, %arg33, %7, %5, %arg192, %arg191) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %8 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%5, %arg128, %8) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %9 = "byre.alias"(%alloc) {offset = 16490496 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 16490496 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg127, %5, %9) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg127, %8, %5) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown12", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %10 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg127, %8, %5) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 10970112 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg126, %arg31, %5, %10, %arg190, %arg189) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, 
memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %11 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%10, %arg125, %11) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %12 = "byre.alias"(%alloc) {offset = 14131200 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 14131200 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %10, %12) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg131, %arg35, %7, %10, %arg197, %arg196) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %13 = "byre.alias"(%alloc) {offset = 12625920 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%10, %arg130, %13) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %14 = "byre.alias"(%alloc) {offset = 819200 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %10, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> - %15 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%13, %11, %arg124, %15) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute 
@BatchNormGradOp_f16f32f16_f16f32f32(%arg123, %arg29, %15, %11, %arg186, %arg185) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %16 = "byre.alias"(%alloc) {offset = 11020288 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg122, %16) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %17 = "byre.alias"(%alloc) {offset = 9740288 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg121, %11, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg121, %16, %11) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg120, %arg27, %11, %16, %arg184, %arg183) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%16, %arg119, %11) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %18 = "byre.alias"(%alloc) {offset = 7380992 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %16, %18) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - %19 = "byre.alias"(%alloc) {offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%15, %11, %arg118, %19) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown27", 
memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg115, %arg23, %19, %11, %arg177, %arg176) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg114, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %20 = "byre.alias"(%alloc) {offset = 8560640 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %11, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg113, %15, %11) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown31", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %21 = "byre.alias"(%alloc) {offset = 6490112 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg112, %arg21, %11, %21, %arg175, %arg174) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %22 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%21, %arg111, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %23 = "byre.alias"(%alloc) {offset = 6791168 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %21, %23) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, 
memref<256x128x3x3xf16, "cuda"> - %24 = "byre.alias"(%alloc) {offset = 11120640 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg117, %arg25, %19, %24, %arg182, %arg181) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %25 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 9740288 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg131, %arg35, %7, %13, %arg197, %arg196) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 12625920 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%13, %arg130, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 819200 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %13, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%14, %11, %arg124, %16) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg123, %arg29, %16, %11, %arg186, %arg185) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 11020288 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg122, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %18 = "byre.alias"(%alloc) <{offset = 9740288 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg121, %11, %18) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg121, %17, %11) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg120, %arg27, %11, %17, %arg184, %arg183) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 8560640 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%17, %arg119, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %20 = "byre.alias"(%alloc) <{offset = 7380992 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %17, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%16, %19, %arg118, %11) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg115, %arg23, %11, %16, %arg177, %arg176) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%16, %arg114, %14) {batch_group_count = 1 : 
i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 8560640 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %16, %21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg113, %14, %16) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg112, %arg21, %16, %14, %arg175, %arg174) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%14, %arg111, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 6791168 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %14, %23) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> + %24 = "byre.alias"(%alloc) <{offset = 11120640 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg117, %arg25, %11, %24, %arg182, %arg181) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%24, %arg116, %25) {batch_group_count = 1 : i64, device = "cuda", 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %26 = "byre.alias"(%alloc) {offset = 311296 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 311296 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %24, %26) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> - %27 = "byre.alias"(%alloc) {offset = 1081344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%25, %22, %arg110, %27) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 1081344 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%25, %22, %arg110, %27) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg109, %arg19, %27, %25, %arg171, %arg170) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%25, %arg108, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %28 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %28 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg107, %25, %28) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg107, %22, %25) {BlockSize.x = 
128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown42", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg107, %22, %25) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown42", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg106, %arg17, %25, %22, %arg169, %arg168) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%22, %arg105, %25) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %29 = "byre.alias"(%alloc) {offset = 1376256 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 1376256 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg104, %22, %29) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %30 = "byre.alias"(%alloc) {offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%27, %25, %arg104, %30) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 6389760 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%27, %25, %arg104, %30) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg101, %arg13, %30, %25, %arg162, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%25, %arg100, %22) {batch_group_count 
= 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %31 = "byre.alias"(%alloc) {offset = 1081344 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 1081344 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg99, %25, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg99, %22, %25) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown50", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %32 = "byre.alias"(%alloc) {offset = 6590464 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg99, %22, %25) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown42", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 6590464 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg98, %arg11, %25, %32, %arg160, %arg159) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %33 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%32, %arg97, %33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %34 = "byre.alias"(%alloc) {offset = 671744 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 671744 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg96, %32, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : 
tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> - %35 = "byre.alias"(%alloc) {offset = 11321344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 11321344 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg103, %arg15, %30, %35, %arg167, %arg166) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %36 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%35, %arg102, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %37 = "byre.alias"(%alloc) {offset = 294912 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 294912 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg96, %35, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> - %38 = "byre.alias"(%alloc) {offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%36, %33, %arg96, %38) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 6389760 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%36, %33, %arg96, %38) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg95, %arg9, %38, %36, %arg156, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%36, %arg94, 
%33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %39 = "byre.alias"(%alloc) {offset = 376832 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 598016 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg93, %36, %39) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg93, %33, %36) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg93, %33, %36) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg92, %arg7, %36, %33, %arg154, %arg153) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%33, %arg91, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %40 = "byre.alias"(%alloc) {offset = 524288 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 524288 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg90, %33, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%38, %36, %arg90, %33) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown65", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%38, %36, 
%arg90, %33) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg89, %arg5, %33, %36, %arg150, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %41 = "byre.alias"(%alloc) {offset = 11321344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 11321344 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%36, %arg88, %41) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %42 = "byre.alias"(%alloc) {offset = 598016 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %42 = "byre.alias"(%alloc) <{offset = 376832 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg87, %36, %42) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg87, %41, %36) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg87, %41, %36) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg86, %arg3, %36, %41, %arg148, %arg147) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%41, %arg85, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %43 = "byre.alias"(%alloc) 
{offset = 450560 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 450560 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg84, %41, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%33, %36, %38) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %44 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%33, %36, %38) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> byre.compute @PoolMaxGradOp_f16f16_f16(%arg83, %38, %44) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<1x64x112x112xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> - %45 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> - byre.compute @PTXOp(%arg83, %44, %45) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%arg83, %44, %45) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg82, %arg1, %45, %44, %arg143, %arg142) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %46 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg81, %44, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, 
input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> - byre.compute @PTXOp(%46, %arg144) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown77", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> - %47 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x1000xf32, "cuda"> - byre.compute @PTXOp(%arg141, %47) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown78", memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf16, "cuda">, memref<1x1000xf32, "cuda"> - %48 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1000xf32, "cuda"> - byre.compute @ReduceSumOp_f32_f32(%47, %48) {device = "cuda", dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf32, "cuda">, memref<1000xf32, "cuda"> - byre.compute @PTXOp(%48, %arg145) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown79", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda"> - %49 = "byre.alias"(%arg141) {offset = 0 : i64} : (memref<1x1000xf16, "cuda">) -> memref<1000x1xf16, "cuda"> - %50 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%49, %arg139, %50) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16, "cuda">, memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%50, %arg146) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> - byre.compute @PTXOp(%43, %arg151) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown81", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%42, %arg152) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%40, %arg157) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown83", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%39, %arg158) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown84", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%34, %arg163) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%31, %arg164) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = 
"Unknown86", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%37, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown87", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> - byre.compute @PTXOp(%29, %arg172) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown88", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%28, %arg173) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%23, %arg178) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%20, %arg179) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown91", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%26, %arg180) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> - byre.compute @PTXOp(%18, %arg187) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown93", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%17, %arg188) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown94", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%12, %arg193) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%9, %arg194) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown96", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%14, %arg195) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown97", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> - byre.compute @PTXOp(%6, %arg202) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown98", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%4, %arg203) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown99", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%46, %arg144) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown77", memory_effects = [1 : i32, 2 : i32]} : 
memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> + byre.compute @PTXOp(%arg141, %arg145) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown78", memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf16, "cuda">, memref<1000xf32, "cuda"> + %47 = "byre.alias"(%arg141) <{offset = 0 : i64}> : (memref<1x1000xf16, "cuda">) -> memref<1000x1xf16, "cuda"> + %48 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%47, %arg139, %48) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16, "cuda">, memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%48, %arg146) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown79", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> + byre.compute @PTXOp(%43, %arg151) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%42, %arg152) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%40, %arg157) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%39, %arg158) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%34, %arg163) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown84", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%31, %arg164) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%37, %arg165) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown86", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> + byre.compute @PTXOp(%29, %arg172) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%28, %arg173) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%23, %arg178) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%21, %arg179) 
{BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%26, %arg180) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown91", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> + byre.compute @PTXOp(%20, %arg187) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%18, %arg188) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%12, %arg193) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown94", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%9, %arg194) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%15, %arg195) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown96", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> + byre.compute @PTXOp(%6, %arg202) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%4, %arg203) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> return } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/device_output.ptx b/compiler/test/E2E/ResNet18/BW/device_output.ptx index 5b5b9aea4..8c22562a6 100644 --- a/compiler/test/E2E/ResNet18/BW/device_output.ptx +++ b/compiler/test/E2E/ResNet18/BW/device_output.ptx @@ -6,285 +6,8 @@ .target sm_70 .address_size 64 - // .globl Unknown99 - -.visible .entry Unknown99( - .param .u64 Unknown99_param_0, - .param .u64 Unknown99_param_1, - .param .u64 Unknown99_param_2, - .param .u64 Unknown99_param_3, - .param .u64 Unknown99_param_4, - .param .u64 Unknown99_param_5, - .param .u64 Unknown99_param_6, - .param .u64 Unknown99_param_7, - .param .u64 Unknown99_param_8, - .param .u64 Unknown99_param_9, - .param .u64 Unknown99_param_10, - .param .u64 Unknown99_param_11, - .param .u64 Unknown99_param_12, - .param .u64 Unknown99_param_13, - .param .u64 Unknown99_param_14, - .param .u64 Unknown99_param_15, - .param .u64 Unknown99_param_16, - .param .u64 Unknown99_param_17, - .param .u64 Unknown99_param_18, - .param .u64 Unknown99_param_19, - .param .u64 Unknown99_param_20, - .param .u64 Unknown99_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 
%rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 2359295; - @%p1 bra $L__BB0_2; - ld.param.u64 %rd4, [Unknown99_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown99_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 55; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -512; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 512; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 55; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 9; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 4608; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB0_2: - ret; - -} - // .globl Unknown98 -.visible .entry Unknown98( - .param .u64 Unknown98_param_0, - .param .u64 Unknown98_param_1, - .param .u64 Unknown98_param_2, - .param .u64 Unknown98_param_3, - .param .u64 Unknown98_param_4, - .param .u64 Unknown98_param_5, - .param .u64 Unknown98_param_6, - .param .u64 Unknown98_param_7, - .param .u64 Unknown98_param_8, - .param .u64 Unknown98_param_9, - .param .u64 Unknown98_param_10, - .param .u64 Unknown98_param_11, - .param .u64 Unknown98_param_12, - .param .u64 Unknown98_param_13, - .param .u64 Unknown98_param_14, - .param .u64 Unknown98_param_15, - .param .u64 Unknown98_param_16, - .param .u64 Unknown98_param_17, - .param .u64 Unknown98_param_18, - .param .u64 Unknown98_param_19, - .param .u64 Unknown98_param_20, - .param .u64 Unknown98_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 2359295; - @%p1 bra $L__BB1_2; - ld.param.u64 %rd4, [Unknown98_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown98_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - 
mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 55; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -512; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 512; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 55; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 9; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 4608; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB1_2: - ret; - -} - // .globl Unknown97 -.visible .entry Unknown97( - .param .u64 Unknown97_param_0, - .param .u64 Unknown97_param_1, - .param .u64 Unknown97_param_2, - .param .u64 Unknown97_param_3, - .param .u64 Unknown97_param_4, - .param .u64 Unknown97_param_5, - .param .u64 Unknown97_param_6, - .param .u64 Unknown97_param_7, - .param .u64 Unknown97_param_8, - .param .u64 Unknown97_param_9, - .param .u64 Unknown97_param_10, - .param .u64 Unknown97_param_11, - .param .u64 Unknown97_param_12, - .param .u64 Unknown97_param_13, - .param .u64 Unknown97_param_14, - .param .u64 Unknown97_param_15, - .param .u64 Unknown97_param_16, - .param .u64 Unknown97_param_17, - .param .u64 Unknown97_param_18, - .param .u64 Unknown97_param_19, - .param .u64 Unknown97_param_20, - .param .u64 Unknown97_param_21 -) -{ - .reg .pred %p<3>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<27>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 131071; - @%p1 bra $L__BB2_2; - ld.param.u64 %rd4, [Unknown97_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown97_param_1]; - cvta.to.global.u64 %rd2, %rd5; - shr.s64 %rd8, %rd3, 63; - shr.u64 %rd9, %rd8, 56; - add.s64 %rd10, %rd3, %rd9; - and.b64 %rd11, %rd10, -256; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 256; - selp.b64 %rd14, %rd13, %rd12, %p2; - xor.b64 %rd15, %rd8, %rd3; - shr.s64 %rd16, %rd15, 63; - shr.u64 %rd17, %rd16, 56; - add.s64 %rd18, %rd15, %rd17; - shr.u64 %rd19, %rd18, 8; - xor.b64 %rd20, %rd19, %rd8; - shl.b64 %rd21, %rd20, 8; - add.s64 %rd22, %rd21, %rd14; - shl.b64 %rd23, %rd22, 1; - add.s64 %rd24, %rd2, %rd23; - ld.global.b16 %h1, [%rd24]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd25, %rd22, 2; - add.s64 %rd26, %rd1, %rd25; - st.global.f32 [%rd26], %f1; -$L__BB2_2: - ret; - -} // .globl Unknown96 + .visible .entry Unknown96( .param .u64 Unknown96_param_0, .param .u64 Unknown96_param_1, @@ -310,80 +33,42 @@ $L__BB2_2: .param .u64 Unknown96_param_21 ) { - .reg 
.pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 2359295; - @%p1 bra $L__BB3_2; - ld.param.u64 %rd4, [Unknown96_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown96_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 55; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -512; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 512; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 55; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 9; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 4608; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB3_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 131071; + @%p1 bra $L__BB0_3; + ld.param.u64 %rd15, [Unknown96_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown96_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB0_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 131072; + @%p2 bra $L__BB0_2; +$L__BB0_3: ret; } @@ -413,80 +98,42 @@ $L__BB3_2: .param .u64 Unknown95_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 1179647; - @%p1 bra $L__BB4_2; - ld.param.u64 %rd4, [Unknown95_param_12]; - cvta.to.global.u64 %rd1, 
%rd4; - ld.param.u64 %rd5, [Unknown95_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 56; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -256; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 256; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 56; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 8; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 2304; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB4_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 2359295; + @%p1 bra $L__BB1_3; + ld.param.u64 %rd15, [Unknown95_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown95_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB1_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 2359296; + @%p2 bra $L__BB1_2; +$L__BB1_3: ret; } @@ -515,255 +162,43 @@ $L__BB4_2: .param .u64 Unknown94_param_20, .param .u64 Unknown94_param_21 ) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 589823; - @%p1 bra $L__BB5_2; - ld.param.u64 %rd4, [Unknown94_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown94_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 
%rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 56; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -256; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 256; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 56; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 8; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 2304; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB5_2: - ret; - -} - // .globl Unknown93 -.visible .entry Unknown93( - .param .u64 Unknown93_param_0, - .param .u64 Unknown93_param_1, - .param .u64 Unknown93_param_2, - .param .u64 Unknown93_param_3, - .param .u64 Unknown93_param_4, - .param .u64 Unknown93_param_5, - .param .u64 Unknown93_param_6, - .param .u64 Unknown93_param_7, - .param .u64 Unknown93_param_8, - .param .u64 Unknown93_param_9, - .param .u64 Unknown93_param_10, - .param .u64 Unknown93_param_11, - .param .u64 Unknown93_param_12, - .param .u64 Unknown93_param_13, - .param .u64 Unknown93_param_14, - .param .u64 Unknown93_param_15, - .param .u64 Unknown93_param_16, - .param .u64 Unknown93_param_17, - .param .u64 Unknown93_param_18, - .param .u64 Unknown93_param_19, - .param .u64 Unknown93_param_20, - .param .u64 Unknown93_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 589823; - @%p1 bra $L__BB6_2; - ld.param.u64 %rd4, [Unknown93_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown93_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 56; 
- add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -256; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 256; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 56; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 8; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 2304; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB6_2: - ret; - -} - // .globl Unknown92 -.visible .entry Unknown92( - .param .u64 Unknown92_param_0, - .param .u64 Unknown92_param_1, - .param .u64 Unknown92_param_2, - .param .u64 Unknown92_param_3, - .param .u64 Unknown92_param_4, - .param .u64 Unknown92_param_5, - .param .u64 Unknown92_param_6, - .param .u64 Unknown92_param_7, - .param .u64 Unknown92_param_8, - .param .u64 Unknown92_param_9, - .param .u64 Unknown92_param_10, - .param .u64 Unknown92_param_11, - .param .u64 Unknown92_param_12, - .param .u64 Unknown92_param_13, - .param .u64 Unknown92_param_14, - .param .u64 Unknown92_param_15, - .param .u64 Unknown92_param_16, - .param .u64 Unknown92_param_17, - .param .u64 Unknown92_param_18, - .param .u64 Unknown92_param_19, - .param .u64 Unknown92_param_20, - .param .u64 Unknown92_param_21 -) { .reg .pred %p<3>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<27>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 32767; - @%p1 bra $L__BB7_2; - ld.param.u64 %rd4, [Unknown92_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown92_param_1]; - cvta.to.global.u64 %rd2, %rd5; - shr.s64 %rd8, %rd3, 63; - shr.u64 %rd9, %rd8, 57; - add.s64 %rd10, %rd3, %rd9; - and.b64 %rd11, %rd10, -128; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 128; - selp.b64 %rd14, %rd13, %rd12, %p2; - xor.b64 %rd15, %rd8, %rd3; - shr.s64 %rd16, %rd15, 63; - shr.u64 %rd17, %rd16, 57; - add.s64 %rd18, %rd15, %rd17; - shr.u64 %rd19, %rd18, 7; - xor.b64 %rd20, %rd19, %rd8; - shl.b64 %rd21, %rd20, 7; - add.s64 %rd22, %rd21, %rd14; - shl.b64 %rd23, %rd22, 1; - add.s64 %rd24, %rd2, %rd23; - ld.global.b16 %h1, [%rd24]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd25, %rd22, 2; - add.s64 %rd26, %rd1, %rd25; - st.global.f32 [%rd26], %f1; -$L__BB7_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 1179647; + @%p1 bra $L__BB2_3; + ld.param.u64 %rd15, [Unknown94_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown94_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB2_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 1179648; + @%p2 bra $L__BB2_2; +$L__BB2_3: ret; } @@ -793,80 
+228,42 @@ $L__BB7_2: .param .u64 Unknown91_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 589823; - @%p1 bra $L__BB8_2; - ld.param.u64 %rd4, [Unknown91_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown91_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 56; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -256; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 256; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 56; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 8; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 2304; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB8_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 32767; + @%p1 bra $L__BB3_3; + ld.param.u64 %rd15, [Unknown91_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown91_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB3_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 32768; + @%p2 bra $L__BB3_2; +$L__BB3_3: ret; } @@ -896,80 +293,42 @@ $L__BB8_2: .param .u64 Unknown90_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 294911; - @%p1 bra $L__BB9_2; - 
ld.param.u64 %rd4, [Unknown90_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown90_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 57; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -128; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 128; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 57; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 7; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 1152; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB9_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 589823; + @%p1 bra $L__BB4_3; + ld.param.u64 %rd15, [Unknown90_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown90_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB4_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 589824; + @%p2 bra $L__BB4_2; +$L__BB4_3: ret; } @@ -998,255 +357,43 @@ $L__BB9_2: .param .u64 Unknown89_param_20, .param .u64 Unknown89_param_21 ) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 147455; - @%p1 bra $L__BB10_2; - ld.param.u64 %rd4, [Unknown89_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown89_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, 
%rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 57; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -128; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 128; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 57; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 7; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 1152; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB10_2: - ret; - -} - // .globl Unknown88 -.visible .entry Unknown88( - .param .u64 Unknown88_param_0, - .param .u64 Unknown88_param_1, - .param .u64 Unknown88_param_2, - .param .u64 Unknown88_param_3, - .param .u64 Unknown88_param_4, - .param .u64 Unknown88_param_5, - .param .u64 Unknown88_param_6, - .param .u64 Unknown88_param_7, - .param .u64 Unknown88_param_8, - .param .u64 Unknown88_param_9, - .param .u64 Unknown88_param_10, - .param .u64 Unknown88_param_11, - .param .u64 Unknown88_param_12, - .param .u64 Unknown88_param_13, - .param .u64 Unknown88_param_14, - .param .u64 Unknown88_param_15, - .param .u64 Unknown88_param_16, - .param .u64 Unknown88_param_17, - .param .u64 Unknown88_param_18, - .param .u64 Unknown88_param_19, - .param .u64 Unknown88_param_20, - .param .u64 Unknown88_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 147455; - @%p1 bra $L__BB11_2; - ld.param.u64 %rd4, [Unknown88_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown88_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, 
%rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 57; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -128; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 128; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 57; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 7; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 1152; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB11_2: - ret; - -} - // .globl Unknown87 -.visible .entry Unknown87( - .param .u64 Unknown87_param_0, - .param .u64 Unknown87_param_1, - .param .u64 Unknown87_param_2, - .param .u64 Unknown87_param_3, - .param .u64 Unknown87_param_4, - .param .u64 Unknown87_param_5, - .param .u64 Unknown87_param_6, - .param .u64 Unknown87_param_7, - .param .u64 Unknown87_param_8, - .param .u64 Unknown87_param_9, - .param .u64 Unknown87_param_10, - .param .u64 Unknown87_param_11, - .param .u64 Unknown87_param_12, - .param .u64 Unknown87_param_13, - .param .u64 Unknown87_param_14, - .param .u64 Unknown87_param_15, - .param .u64 Unknown87_param_16, - .param .u64 Unknown87_param_17, - .param .u64 Unknown87_param_18, - .param .u64 Unknown87_param_19, - .param .u64 Unknown87_param_20, - .param .u64 Unknown87_param_21 -) { .reg .pred %p<3>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<27>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 8191; - @%p1 bra $L__BB12_2; - ld.param.u64 %rd4, [Unknown87_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown87_param_1]; - cvta.to.global.u64 %rd2, %rd5; - shr.s64 %rd8, %rd3, 63; - shr.u64 %rd9, %rd8, 58; - add.s64 %rd10, %rd3, %rd9; - and.b64 %rd11, %rd10, -64; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 64; - selp.b64 %rd14, %rd13, %rd12, %p2; - xor.b64 %rd15, %rd8, %rd3; - shr.s64 %rd16, %rd15, 63; - shr.u64 %rd17, %rd16, 58; - add.s64 %rd18, %rd15, %rd17; - shr.u64 %rd19, %rd18, 6; - xor.b64 %rd20, %rd19, %rd8; - shl.b64 %rd21, %rd20, 6; - add.s64 %rd22, %rd21, %rd14; - shl.b64 %rd23, %rd22, 1; - add.s64 %rd24, %rd2, %rd23; - ld.global.b16 %h1, [%rd24]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd25, %rd22, 2; - add.s64 %rd26, %rd1, %rd25; - st.global.f32 [%rd26], %f1; -$L__BB12_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 294911; + @%p1 bra $L__BB5_3; + ld.param.u64 %rd15, [Unknown89_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown89_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB5_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, 
%rd23, 294912; + @%p2 bra $L__BB5_2; +$L__BB5_3: ret; } @@ -1276,80 +423,42 @@ $L__BB12_2: .param .u64 Unknown86_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 147455; - @%p1 bra $L__BB13_2; - ld.param.u64 %rd4, [Unknown86_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown86_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 57; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -128; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 128; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 57; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 7; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 1152; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB13_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 8191; + @%p1 bra $L__BB6_3; + ld.param.u64 %rd15, [Unknown86_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown86_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB6_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 8192; + @%p2 bra $L__BB6_2; +$L__BB6_3: ret; } @@ -1379,80 +488,42 @@ $L__BB13_2: .param .u64 Unknown85_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, 
%rd6; - setp.gt.s64 %p1, %rd3, 73727; - @%p1 bra $L__BB14_2; - ld.param.u64 %rd4, [Unknown85_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown85_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB14_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 147455; + @%p1 bra $L__BB7_3; + ld.param.u64 %rd15, [Unknown85_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown85_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB7_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 147456; + @%p2 bra $L__BB7_2; +$L__BB7_3: ret; } @@ -1482,389 +553,42 @@ $L__BB14_2: .param .u64 Unknown84_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 36863; - @%p1 bra $L__BB15_2; - ld.param.u64 %rd4, [Unknown84_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown84_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, 
%rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB15_2: - ret; - -} - // .globl Unknown83 -.visible .entry Unknown83( - .param .u64 Unknown83_param_0, - .param .u64 Unknown83_param_1, - .param .u64 Unknown83_param_2, - .param .u64 Unknown83_param_3, - .param .u64 Unknown83_param_4, - .param .u64 Unknown83_param_5, - .param .u64 Unknown83_param_6, - .param .u64 Unknown83_param_7, - .param .u64 Unknown83_param_8, - .param .u64 Unknown83_param_9, - .param .u64 Unknown83_param_10, - .param .u64 Unknown83_param_11, - .param .u64 Unknown83_param_12, - .param .u64 Unknown83_param_13, - .param .u64 Unknown83_param_14, - .param .u64 Unknown83_param_15, - .param .u64 Unknown83_param_16, - .param .u64 Unknown83_param_17, - .param .u64 Unknown83_param_18, - .param .u64 Unknown83_param_19, - .param .u64 Unknown83_param_20, - .param .u64 Unknown83_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 36863; - @%p1 bra $L__BB16_2; - ld.param.u64 %rd4, [Unknown83_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown83_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, 
%rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB16_2: - ret; - -} - // .globl Unknown82 -.visible .entry Unknown82( - .param .u64 Unknown82_param_0, - .param .u64 Unknown82_param_1, - .param .u64 Unknown82_param_2, - .param .u64 Unknown82_param_3, - .param .u64 Unknown82_param_4, - .param .u64 Unknown82_param_5, - .param .u64 Unknown82_param_6, - .param .u64 Unknown82_param_7, - .param .u64 Unknown82_param_8, - .param .u64 Unknown82_param_9, - .param .u64 Unknown82_param_10, - .param .u64 Unknown82_param_11, - .param .u64 Unknown82_param_12, - .param .u64 Unknown82_param_13, - .param .u64 Unknown82_param_14, - .param .u64 Unknown82_param_15, - .param .u64 Unknown82_param_16, - .param .u64 Unknown82_param_17, - .param .u64 Unknown82_param_18, - .param .u64 Unknown82_param_19, - .param .u64 Unknown82_param_20, - .param .u64 Unknown82_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 36863; - @%p1 bra $L__BB17_2; - ld.param.u64 %rd4, [Unknown82_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown82_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 
%rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB17_2: - ret; - -} - // .globl Unknown81 -.visible .entry Unknown81( - .param .u64 Unknown81_param_0, - .param .u64 Unknown81_param_1, - .param .u64 Unknown81_param_2, - .param .u64 Unknown81_param_3, - .param .u64 Unknown81_param_4, - .param .u64 Unknown81_param_5, - .param .u64 Unknown81_param_6, - .param .u64 Unknown81_param_7, - .param .u64 Unknown81_param_8, - .param .u64 Unknown81_param_9, - .param .u64 Unknown81_param_10, - .param .u64 Unknown81_param_11, - .param .u64 Unknown81_param_12, - .param .u64 Unknown81_param_13, - .param .u64 Unknown81_param_14, - .param .u64 Unknown81_param_15, - .param .u64 Unknown81_param_16, - .param .u64 Unknown81_param_17, - .param .u64 Unknown81_param_18, - .param .u64 Unknown81_param_19, - .param .u64 Unknown81_param_20, - .param .u64 Unknown81_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 36863; - @%p1 bra $L__BB18_2; - ld.param.u64 %rd4, [Unknown81_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown81_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB18_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 73727; + @%p1 bra $L__BB8_3; + ld.param.u64 %rd15, [Unknown84_param_12]; + 
cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown84_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB8_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 73728; + @%p2 bra $L__BB8_2; +$L__BB8_3: ret; } @@ -1883,51 +607,53 @@ $L__BB18_2: .param .u64 Unknown80_param_10, .param .u64 Unknown80_param_11, .param .u64 Unknown80_param_12, - .param .u64 Unknown80_param_13 + .param .u64 Unknown80_param_13, + .param .u64 Unknown80_param_14, + .param .u64 Unknown80_param_15, + .param .u64 Unknown80_param_16, + .param .u64 Unknown80_param_17, + .param .u64 Unknown80_param_18, + .param .u64 Unknown80_param_19, + .param .u64 Unknown80_param_20, + .param .u64 Unknown80_param_21 ) { .reg .pred %p<3>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<27>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 511999; - @%p1 bra $L__BB19_2; - ld.param.u64 %rd4, [Unknown80_param_8]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown80_param_1]; - cvta.to.global.u64 %rd2, %rd5; - shr.s64 %rd8, %rd3, 63; - shr.u64 %rd9, %rd8, 55; - add.s64 %rd10, %rd3, %rd9; - and.b64 %rd11, %rd10, -512; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 512; - selp.b64 %rd14, %rd13, %rd12, %p2; - xor.b64 %rd15, %rd8, %rd3; - shr.s64 %rd16, %rd15, 63; - shr.u64 %rd17, %rd16, 55; - add.s64 %rd18, %rd15, %rd17; - shr.u64 %rd19, %rd18, 9; - xor.b64 %rd20, %rd19, %rd8; - shl.b64 %rd21, %rd20, 9; - add.s64 %rd22, %rd21, %rd14; - shl.b64 %rd23, %rd22, 1; - add.s64 %rd24, %rd2, %rd23; - ld.global.b16 %h1, [%rd24]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd25, %rd22, 2; - add.s64 %rd26, %rd1, %rd25; - st.global.f32 [%rd26], %f1; -$L__BB19_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 36863; + @%p1 bra $L__BB9_3; + ld.param.u64 %rd15, [Unknown80_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown80_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB9_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 36864; + @%p2 bra $L__BB9_2; +$L__BB9_3: ret; } @@ -1942,35 +668,49 @@ $L__BB19_2: .param .u64 Unknown79_param_6, .param .u64 Unknown79_param_7, .param .u64 Unknown79_param_8, - .param .u64 Unknown79_param_9 + .param .u64 Unknown79_param_9, + .param .u64 Unknown79_param_10, + .param .u64 Unknown79_param_11, + .param .u64 Unknown79_param_12, + .param .u64 Unknown79_param_13 ) { - .reg .pred %p<2>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<11>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; + .reg .f32 %f<2>; + 
.reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd7, %r3; - mul.wide.s32 %rd8, %r2, %r1; - add.s64 %rd9, %rd8, %rd7; - setp.gt.s64 %p1, %rd9, 999; - @%p1 bra $L__BB20_2; - ld.param.u64 %rd3, [Unknown79_param_6]; - cvta.to.global.u64 %rd4, %rd3; - ld.param.u64 %rd5, [Unknown79_param_1]; - cvta.to.global.u64 %rd6, %rd5; - shl.b64 %rd10, %rd9, 2; - add.s64 %rd1, %rd6, %rd10; - add.s64 %rd2, %rd4, %rd10; - ld.global.f32 %f1, [%rd1]; - cvt.rn.f16.f32 %h1, %f1; - cvt.f32.f16 %f2, %h1; - st.global.f32 [%rd2], %f2; -$L__BB20_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 511999; + @%p1 bra $L__BB10_3; + ld.param.u64 %rd15, [Unknown79_param_8]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown79_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB10_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 512000; + @%p2 bra $L__BB10_2; +$L__BB10_3: ret; } @@ -1992,32 +732,42 @@ $L__BB20_2: .param .u64 Unknown78_param_13 ) { - .reg .pred %p<2>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<12>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd7, %r3; - mul.wide.s32 %rd8, %r2, %r1; - add.s64 %rd9, %rd8, %rd7; - setp.gt.s64 %p1, %rd9, 999; - @%p1 bra $L__BB21_2; - ld.param.u64 %rd3, [Unknown78_param_8]; - cvta.to.global.u64 %rd4, %rd3; - ld.param.u64 %rd5, [Unknown78_param_1]; - cvta.to.global.u64 %rd6, %rd5; - shl.b64 %rd10, %rd9, 1; - add.s64 %rd1, %rd6, %rd10; - shl.b64 %rd11, %rd9, 2; - add.s64 %rd2, %rd4, %rd11; - ld.global.b16 %h1, [%rd1]; - cvt.f32.f16 %f1, %h1; - st.global.f32 [%rd2], %f1; -$L__BB21_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 999; + @%p1 bra $L__BB11_3; + ld.param.u64 %rd15, [Unknown78_param_8]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown78_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB11_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 1000; + @%p2 bra $L__BB11_2; +$L__BB11_3: ret; } @@ -2047,84 +797,42 @@ $L__BB21_2: .param .u64 Unknown77_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<61>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 9407; - @%p1 bra $L__BB22_2; - ld.param.u64 %rd4, [Unknown77_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown77_param_1]; - cvta.to.global.u64 %rd2, 
%rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 1; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 7; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 7; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 1; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 1; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 7; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 7; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 1; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.hi.s64 %rd38, %rd37, 6148914691236517206; - shr.u64 %rd39, %rd38, 63; - add.s64 %rd40, %rd38, %rd39; - mul.lo.s64 %rd41, %rd40, 3; - sub.s64 %rd42, %rd37, %rd41; - setp.lt.s64 %p4, %rd42, 0; - add.s64 %rd43, %rd42, 3; - selp.b64 %rd44, %rd43, %rd42, %p4; - shr.s64 %rd45, %rd37, 63; - xor.b64 %rd46, %rd45, %rd37; - mul.hi.s64 %rd47, %rd46, 6148914691236517206; - shr.u64 %rd48, %rd47, 63; - add.s64 %rd49, %rd47, %rd48; - xor.b64 %rd50, %rd49, %rd45; - mul.lo.s64 %rd51, %rd50, 147; - mul.lo.s64 %rd52, %rd44, 49; - mul.lo.s64 %rd53, %rd30, 7; - add.s64 %rd54, %rd53, %rd15; - add.s64 %rd55, %rd54, %rd52; - add.s64 %rd56, %rd55, %rd51; - shl.b64 %rd57, %rd56, 1; - add.s64 %rd58, %rd2, %rd57; - ld.global.b16 %h1, [%rd58]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd59, %rd56, 2; - add.s64 %rd60, %rd1, %rd59; - st.global.f32 [%rd60], %f1; -$L__BB22_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 9407; + @%p1 bra $L__BB12_3; + ld.param.u64 %rd15, [Unknown77_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown77_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB12_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 9408; + @%p2 bra $L__BB12_2; +$L__BB12_3: ret; } @@ -2165,72 +873,44 @@ $L__BB22_2: .param .u64 Unknown74_param_32 ) { - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; + .reg .pred %p<4>; + .reg .b16 %rs<5>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 802815; - @%p1 bra $L__BB23_2; - ld.param.u64 %rd5, [Unknown74_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown74_param_1]; - ld.param.u64 %rd7, [Unknown74_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 5; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 112; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - 
add.s64 %rd16, %rd15, 112; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 5; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 5; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 112; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 112; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 5; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 12544; - mul.lo.s64 %rd41, %rd32, 112; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB23_2: + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 802815; + @%p1 bra $L__BB13_3; + ld.param.u64 %rd12, [Unknown74_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown74_param_1]; + ld.param.u64 %rd14, [Unknown74_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB13_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + mov.b16 %rs3, 0x0000; + setp.gt.f16 %p2, %rs1, %rs3; + selp.b16 %rs4, %rs2, 0x0000, %p2; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs4; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p3, %rd21, 802816; + @%p3 bra $L__BB13_2; +$L__BB13_3: ret; } @@ -2271,298 +951,42 @@ $L__BB23_2: .param .u64 Unknown73_param_32 ) { - .reg .pred %p<4>; - .reg .b16 %h<4>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 200703; - @%p1 bra $L__BB24_2; - ld.param.u64 %rd5, [Unknown73_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown73_param_1]; - ld.param.u64 %rd7, [Unknown73_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 4; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 56; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 56; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 4; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 4; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 56; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 56; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 
%rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 4; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 3136; - mul.lo.s64 %rd41, %rd32, 56; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h3; -$L__BB24_2: - ret; - -} - // .globl Unknown69 -.visible .entry Unknown69( - .param .u64 Unknown69_param_0, - .param .u64 Unknown69_param_1, - .param .u64 Unknown69_param_2, - .param .u64 Unknown69_param_3, - .param .u64 Unknown69_param_4, - .param .u64 Unknown69_param_5, - .param .u64 Unknown69_param_6, - .param .u64 Unknown69_param_7, - .param .u64 Unknown69_param_8, - .param .u64 Unknown69_param_9, - .param .u64 Unknown69_param_10, - .param .u64 Unknown69_param_11, - .param .u64 Unknown69_param_12, - .param .u64 Unknown69_param_13, - .param .u64 Unknown69_param_14, - .param .u64 Unknown69_param_15, - .param .u64 Unknown69_param_16, - .param .u64 Unknown69_param_17, - .param .u64 Unknown69_param_18, - .param .u64 Unknown69_param_19, - .param .u64 Unknown69_param_20, - .param .u64 Unknown69_param_21, - .param .u64 Unknown69_param_22, - .param .u64 Unknown69_param_23, - .param .u64 Unknown69_param_24, - .param .u64 Unknown69_param_25, - .param .u64 Unknown69_param_26, - .param .u64 Unknown69_param_27, - .param .u64 Unknown69_param_28, - .param .u64 Unknown69_param_29, - .param .u64 Unknown69_param_30, - .param .u64 Unknown69_param_31, - .param .u64 Unknown69_param_32 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 200703; - @%p1 bra $L__BB25_2; - ld.param.u64 %rd5, [Unknown69_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown69_param_1]; - ld.param.u64 %rd7, [Unknown69_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 4; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 56; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 56; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 4; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 4; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 56; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 56; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 4; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 3136; - mul.lo.s64 %rd41, %rd32, 56; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; 
- ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB25_2: - ret; - -} - // .globl Unknown65 -.visible .entry Unknown65( - .param .u64 Unknown65_param_0, - .param .u64 Unknown65_param_1, - .param .u64 Unknown65_param_2, - .param .u64 Unknown65_param_3, - .param .u64 Unknown65_param_4, - .param .u64 Unknown65_param_5, - .param .u64 Unknown65_param_6, - .param .u64 Unknown65_param_7, - .param .u64 Unknown65_param_8, - .param .u64 Unknown65_param_9, - .param .u64 Unknown65_param_10, - .param .u64 Unknown65_param_11, - .param .u64 Unknown65_param_12, - .param .u64 Unknown65_param_13, - .param .u64 Unknown65_param_14, - .param .u64 Unknown65_param_15, - .param .u64 Unknown65_param_16, - .param .u64 Unknown65_param_17, - .param .u64 Unknown65_param_18, - .param .u64 Unknown65_param_19, - .param .u64 Unknown65_param_20, - .param .u64 Unknown65_param_21, - .param .u64 Unknown65_param_22, - .param .u64 Unknown65_param_23, - .param .u64 Unknown65_param_24, - .param .u64 Unknown65_param_25, - .param .u64 Unknown65_param_26, - .param .u64 Unknown65_param_27, - .param .u64 Unknown65_param_28, - .param .u64 Unknown65_param_29, - .param .u64 Unknown65_param_30, - .param .u64 Unknown65_param_31, - .param .u64 Unknown65_param_32, - .param .u64 Unknown65_param_33, - .param .u64 Unknown65_param_34, - .param .u64 Unknown65_param_35, - .param .u64 Unknown65_param_36, - .param .u64 Unknown65_param_37, - .param .u64 Unknown65_param_38, - .param .u64 Unknown65_param_39, - .param .u64 Unknown65_param_40, - .param .u64 Unknown65_param_41, - .param .u64 Unknown65_param_42, - .param .u64 Unknown65_param_43 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<7>; - .reg .b32 %r<4>; - .reg .b64 %rd<51>; + .reg .pred %p<3>; + .reg .b16 %rs<4>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd10, %r3; - mul.wide.s32 %rd11, %r2, %r1; - add.s64 %rd5, %rd11, %rd10; - setp.gt.s64 %p1, %rd5, 200703; - @%p1 bra $L__BB26_2; - ld.param.u64 %rd6, [Unknown65_param_34]; - cvta.to.global.u64 %rd1, %rd6; - ld.param.u64 %rd7, [Unknown65_param_1]; - ld.param.u64 %rd8, [Unknown65_param_23]; - cvta.to.global.u64 %rd2, %rd8; - ld.param.u64 %rd9, [Unknown65_param_12]; - cvta.to.global.u64 %rd3, %rd9; - cvta.to.global.u64 %rd4, %rd7; - mul.hi.s64 %rd12, %rd5, 5270498306774157605; - shr.u64 %rd13, %rd12, 63; - shr.s64 %rd14, %rd12, 4; - add.s64 %rd15, %rd14, %rd13; - mul.lo.s64 %rd16, %rd15, 56; - sub.s64 %rd17, %rd5, %rd16; - setp.lt.s64 %p2, %rd17, 0; - add.s64 %rd18, %rd17, 56; - selp.b64 %rd19, %rd18, %rd17, %p2; - shr.s64 %rd20, %rd5, 63; - xor.b64 %rd21, %rd20, %rd5; - mul.hi.s64 %rd22, %rd21, 5270498306774157605; - shr.u64 %rd23, %rd22, 63; - shr.s64 %rd24, %rd22, 4; - add.s64 %rd25, %rd24, %rd23; - xor.b64 %rd26, %rd25, %rd20; - mul.hi.s64 %rd27, %rd26, 5270498306774157605; - shr.u64 %rd28, %rd27, 63; - shr.s64 %rd29, %rd27, 4; - add.s64 %rd30, %rd29, %rd28; - mul.lo.s64 %rd31, %rd30, 56; - sub.s64 %rd32, %rd26, %rd31; - setp.lt.s64 %p3, %rd32, 0; - add.s64 %rd33, %rd32, 56; - selp.b64 %rd34, %rd33, %rd32, %p3; - shr.s64 %rd35, %rd26, 63; - xor.b64 %rd36, %rd35, %rd26; - mul.hi.s64 %rd37, %rd36, 5270498306774157605; - shr.u64 %rd38, %rd37, 63; - shr.s64 %rd39, %rd37, 4; - add.s64 %rd40, %rd39, %rd38; - xor.b64 %rd41, %rd40, %rd35; - mul.lo.s64 %rd42, %rd41, 3136; - mul.lo.s64 %rd43, %rd34, 56; - add.s64 %rd44, %rd43, %rd19; - add.s64 %rd45, 
%rd44, %rd42; - shl.b64 %rd46, %rd45, 1; - add.s64 %rd47, %rd2, %rd46; - ld.global.b16 %h1, [%rd47]; - add.s64 %rd48, %rd4, %rd46; - ld.global.b16 %h2, [%rd48]; - add.s64 %rd49, %rd3, %rd46; - ld.global.b16 %h3, [%rd49]; - add.rn.f16 %h4, %h2, %h3; - mov.b16 %h5, 0x0000; - setp.gt.f16 %p4, %h1, %h5; - selp.b16 %h6, %h4, 0x0000, %p4; - add.s64 %rd50, %rd1, %rd46; - st.global.b16 [%rd50], %h6; -$L__BB26_2: + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 200703; + @%p1 bra $L__BB14_3; + ld.param.u64 %rd12, [Unknown73_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown73_param_1]; + ld.param.u64 %rd14, [Unknown73_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB14_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + add.rn.f16 %rs3, %rs1, %rs2; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs3; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 200704; + @%p2 bra $L__BB14_2; +$L__BB14_3: ret; } @@ -2603,72 +1027,44 @@ $L__BB26_2: .param .u64 Unknown61_param_32 ) { - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; + .reg .pred %p<4>; + .reg .b16 %rs<5>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 200703; - @%p1 bra $L__BB27_2; - ld.param.u64 %rd5, [Unknown61_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown61_param_1]; - ld.param.u64 %rd7, [Unknown61_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 4; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 56; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 56; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 4; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 4; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 56; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 56; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 4; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 3136; - mul.lo.s64 %rd41, %rd32, 56; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB27_2: + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 200703; + @%p1 bra $L__BB15_3; + ld.param.u64 %rd12, 
[Unknown61_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown61_param_1]; + ld.param.u64 %rd14, [Unknown61_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB15_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + mov.b16 %rs3, 0x0000; + setp.gt.f16 %p2, %rs1, %rs3; + selp.b16 %rs4, %rs2, 0x0000, %p2; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs4; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p3, %rd21, 200704; + @%p3 bra $L__BB15_2; +$L__BB15_3: ret; } @@ -2720,305 +1116,49 @@ $L__BB27_2: .param .u64 Unknown57_param_43 ) { - .reg .pred %p<5>; - .reg .b16 %h<7>; - .reg .b32 %r<4>; - .reg .b64 %rd<51>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd10, %r3; - mul.wide.s32 %rd11, %r2, %r1; - add.s64 %rd5, %rd11, %rd10; - setp.gt.s64 %p1, %rd5, 200703; - @%p1 bra $L__BB28_2; - ld.param.u64 %rd6, [Unknown57_param_34]; - cvta.to.global.u64 %rd1, %rd6; - ld.param.u64 %rd7, [Unknown57_param_1]; - ld.param.u64 %rd8, [Unknown57_param_23]; - cvta.to.global.u64 %rd2, %rd8; - ld.param.u64 %rd9, [Unknown57_param_12]; - cvta.to.global.u64 %rd3, %rd9; - cvta.to.global.u64 %rd4, %rd7; - mul.hi.s64 %rd12, %rd5, 5270498306774157605; - shr.u64 %rd13, %rd12, 63; - shr.s64 %rd14, %rd12, 4; - add.s64 %rd15, %rd14, %rd13; - mul.lo.s64 %rd16, %rd15, 56; - sub.s64 %rd17, %rd5, %rd16; - setp.lt.s64 %p2, %rd17, 0; - add.s64 %rd18, %rd17, 56; - selp.b64 %rd19, %rd18, %rd17, %p2; - shr.s64 %rd20, %rd5, 63; - xor.b64 %rd21, %rd20, %rd5; - mul.hi.s64 %rd22, %rd21, 5270498306774157605; - shr.u64 %rd23, %rd22, 63; - shr.s64 %rd24, %rd22, 4; - add.s64 %rd25, %rd24, %rd23; - xor.b64 %rd26, %rd25, %rd20; - mul.hi.s64 %rd27, %rd26, 5270498306774157605; - shr.u64 %rd28, %rd27, 63; - shr.s64 %rd29, %rd27, 4; - add.s64 %rd30, %rd29, %rd28; - mul.lo.s64 %rd31, %rd30, 56; - sub.s64 %rd32, %rd26, %rd31; - setp.lt.s64 %p3, %rd32, 0; - add.s64 %rd33, %rd32, 56; - selp.b64 %rd34, %rd33, %rd32, %p3; - shr.s64 %rd35, %rd26, 63; - xor.b64 %rd36, %rd35, %rd26; - mul.hi.s64 %rd37, %rd36, 5270498306774157605; - shr.u64 %rd38, %rd37, 63; - shr.s64 %rd39, %rd37, 4; - add.s64 %rd40, %rd39, %rd38; - xor.b64 %rd41, %rd40, %rd35; - mul.lo.s64 %rd42, %rd41, 3136; - mul.lo.s64 %rd43, %rd34, 56; - add.s64 %rd44, %rd43, %rd19; - add.s64 %rd45, %rd44, %rd42; - shl.b64 %rd46, %rd45, 1; - add.s64 %rd47, %rd2, %rd46; - ld.global.b16 %h1, [%rd47]; - add.s64 %rd48, %rd4, %rd46; - ld.global.b16 %h2, [%rd48]; - add.s64 %rd49, %rd3, %rd46; - ld.global.b16 %h3, [%rd49]; - add.rn.f16 %h4, %h2, %h3; - mov.b16 %h5, 0x0000; - setp.gt.f16 %p4, %h1, %h5; - selp.b16 %h6, %h4, 0x0000, %p4; - add.s64 %rd50, %rd1, %rd46; - st.global.b16 [%rd50], %h6; -$L__BB28_2: - ret; - -} - // .globl Unknown50 -.visible .entry Unknown50( - .param .u64 Unknown50_param_0, - .param .u64 Unknown50_param_1, - .param .u64 Unknown50_param_2, - .param .u64 Unknown50_param_3, - .param .u64 Unknown50_param_4, - .param .u64 Unknown50_param_5, - .param .u64 Unknown50_param_6, - .param .u64 Unknown50_param_7, - .param .u64 Unknown50_param_8, - .param .u64 Unknown50_param_9, - .param .u64 Unknown50_param_10, - .param .u64 Unknown50_param_11, - .param .u64 Unknown50_param_12, - .param .u64 Unknown50_param_13, - .param .u64 Unknown50_param_14, - .param .u64 
Unknown50_param_15, - .param .u64 Unknown50_param_16, - .param .u64 Unknown50_param_17, - .param .u64 Unknown50_param_18, - .param .u64 Unknown50_param_19, - .param .u64 Unknown50_param_20, - .param .u64 Unknown50_param_21, - .param .u64 Unknown50_param_22, - .param .u64 Unknown50_param_23, - .param .u64 Unknown50_param_24, - .param .u64 Unknown50_param_25, - .param .u64 Unknown50_param_26, - .param .u64 Unknown50_param_27, - .param .u64 Unknown50_param_28, - .param .u64 Unknown50_param_29, - .param .u64 Unknown50_param_30, - .param .u64 Unknown50_param_31, - .param .u64 Unknown50_param_32 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 100351; - @%p1 bra $L__BB29_2; - ld.param.u64 %rd5, [Unknown50_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown50_param_1]; - ld.param.u64 %rd7, [Unknown50_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 3; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 28; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 28; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 3; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 3; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 28; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 28; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 3; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 784; - mul.lo.s64 %rd41, %rd32, 28; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB29_2: - ret; - -} - // .globl Unknown46 -.visible .entry Unknown46( - .param .u64 Unknown46_param_0, - .param .u64 Unknown46_param_1, - .param .u64 Unknown46_param_2, - .param .u64 Unknown46_param_3, - .param .u64 Unknown46_param_4, - .param .u64 Unknown46_param_5, - .param .u64 Unknown46_param_6, - .param .u64 Unknown46_param_7, - .param .u64 Unknown46_param_8, - .param .u64 Unknown46_param_9, - .param .u64 Unknown46_param_10, - .param .u64 Unknown46_param_11, - .param .u64 Unknown46_param_12, - .param .u64 Unknown46_param_13, - .param .u64 Unknown46_param_14, - .param .u64 Unknown46_param_15, - .param .u64 Unknown46_param_16, - .param .u64 Unknown46_param_17, - .param .u64 Unknown46_param_18, - .param .u64 Unknown46_param_19, - .param .u64 Unknown46_param_20, - .param .u64 Unknown46_param_21, - .param .u64 Unknown46_param_22, - .param .u64 Unknown46_param_23, - .param .u64 Unknown46_param_24, - .param .u64 Unknown46_param_25, - .param .u64 Unknown46_param_26, - 
.param .u64 Unknown46_param_27, - .param .u64 Unknown46_param_28, - .param .u64 Unknown46_param_29, - .param .u64 Unknown46_param_30, - .param .u64 Unknown46_param_31, - .param .u64 Unknown46_param_32, - .param .u64 Unknown46_param_33, - .param .u64 Unknown46_param_34, - .param .u64 Unknown46_param_35, - .param .u64 Unknown46_param_36, - .param .u64 Unknown46_param_37, - .param .u64 Unknown46_param_38, - .param .u64 Unknown46_param_39, - .param .u64 Unknown46_param_40, - .param .u64 Unknown46_param_41, - .param .u64 Unknown46_param_42, - .param .u64 Unknown46_param_43 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<7>; - .reg .b32 %r<4>; - .reg .b64 %rd<51>; + .reg .pred %p<4>; + .reg .b16 %rs<7>; + .reg .b32 %r<5>; + .reg .b64 %rd<25>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd10, %r3; - mul.wide.s32 %rd11, %r2, %r1; - add.s64 %rd5, %rd11, %rd10; - setp.gt.s64 %p1, %rd5, 100351; - @%p1 bra $L__BB30_2; - ld.param.u64 %rd6, [Unknown46_param_34]; - cvta.to.global.u64 %rd1, %rd6; - ld.param.u64 %rd7, [Unknown46_param_1]; - ld.param.u64 %rd8, [Unknown46_param_23]; - cvta.to.global.u64 %rd2, %rd8; - ld.param.u64 %rd9, [Unknown46_param_12]; - cvta.to.global.u64 %rd3, %rd9; - cvta.to.global.u64 %rd4, %rd7; - mul.hi.s64 %rd12, %rd5, 5270498306774157605; - shr.u64 %rd13, %rd12, 63; - shr.s64 %rd14, %rd12, 3; - add.s64 %rd15, %rd14, %rd13; - mul.lo.s64 %rd16, %rd15, 28; - sub.s64 %rd17, %rd5, %rd16; - setp.lt.s64 %p2, %rd17, 0; - add.s64 %rd18, %rd17, 28; - selp.b64 %rd19, %rd18, %rd17, %p2; - shr.s64 %rd20, %rd5, 63; - xor.b64 %rd21, %rd20, %rd5; - mul.hi.s64 %rd22, %rd21, 5270498306774157605; - shr.u64 %rd23, %rd22, 63; - shr.s64 %rd24, %rd22, 3; - add.s64 %rd25, %rd24, %rd23; - xor.b64 %rd26, %rd25, %rd20; - mul.hi.s64 %rd27, %rd26, 5270498306774157605; - shr.u64 %rd28, %rd27, 63; - shr.s64 %rd29, %rd27, 3; - add.s64 %rd30, %rd29, %rd28; - mul.lo.s64 %rd31, %rd30, 28; - sub.s64 %rd32, %rd26, %rd31; - setp.lt.s64 %p3, %rd32, 0; - add.s64 %rd33, %rd32, 28; - selp.b64 %rd34, %rd33, %rd32, %p3; - shr.s64 %rd35, %rd26, 63; - xor.b64 %rd36, %rd35, %rd26; - mul.hi.s64 %rd37, %rd36, 5270498306774157605; - shr.u64 %rd38, %rd37, 63; - shr.s64 %rd39, %rd37, 3; - add.s64 %rd40, %rd39, %rd38; - xor.b64 %rd41, %rd40, %rd35; - mul.lo.s64 %rd42, %rd41, 784; - mul.lo.s64 %rd43, %rd34, 28; - add.s64 %rd44, %rd43, %rd19; - add.s64 %rd45, %rd44, %rd42; - shl.b64 %rd46, %rd45, 1; - add.s64 %rd47, %rd2, %rd46; - ld.global.b16 %h1, [%rd47]; - add.s64 %rd48, %rd4, %rd46; - ld.global.b16 %h2, [%rd48]; - add.s64 %rd49, %rd3, %rd46; - ld.global.b16 %h3, [%rd49]; - add.rn.f16 %h4, %h2, %h3; - mov.b16 %h5, 0x0000; - setp.gt.f16 %p4, %h1, %h5; - selp.b16 %h6, %h4, 0x0000, %p4; - add.s64 %rd50, %rd1, %rd46; - st.global.b16 [%rd50], %h6; -$L__BB30_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd24, %rd18, %rd17; + setp.gt.s64 %p1, %rd24, 200703; + @%p1 bra $L__BB16_3; + ld.param.u64 %rd13, [Unknown57_param_34]; + cvta.to.global.u64 %rd1, %rd13; + ld.param.u64 %rd14, [Unknown57_param_1]; + ld.param.u64 %rd15, [Unknown57_param_23]; + cvta.to.global.u64 %rd2, %rd15; + ld.param.u64 %rd16, [Unknown57_param_12]; + cvta.to.global.u64 %rd3, %rd16; + cvta.to.global.u64 %rd4, %rd14; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd6, %r2, %r4; + shl.b64 %rd23, %rd24, 1; + shl.b64 %rd8, %rd6, 1; +$L__BB16_2: + add.s64 %rd19, %rd4, %rd23; + ld.global.nc.u16 %rs1, [%rd19]; + add.s64 %rd20, %rd3, %rd23; + ld.global.nc.u16 %rs2, [%rd20]; + add.s64 %rd21, %rd2, %rd23; + ld.global.nc.u16 
%rs3, [%rd21]; + add.rn.f16 %rs4, %rs1, %rs2; + mov.b16 %rs5, 0x0000; + setp.gt.f16 %p2, %rs3, %rs5; + selp.b16 %rs6, %rs4, 0x0000, %p2; + add.s64 %rd22, %rd1, %rd23; + st.global.b16 [%rd22], %rs6; + add.s64 %rd24, %rd24, %rd6; + add.s64 %rd23, %rd23, %rd8; + setp.lt.s64 %p3, %rd24, 200704; + @%p3 bra $L__BB16_2; +$L__BB16_3: ret; } @@ -3059,72 +1199,44 @@ $L__BB30_2: .param .u64 Unknown42_param_32 ) { - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; + .reg .pred %p<4>; + .reg .b16 %rs<5>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 100351; - @%p1 bra $L__BB31_2; - ld.param.u64 %rd5, [Unknown42_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown42_param_1]; - ld.param.u64 %rd7, [Unknown42_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 3; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 28; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 28; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 3; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 3; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 28; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 28; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 3; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 784; - mul.lo.s64 %rd41, %rd32, 28; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB31_2: + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 100351; + @%p1 bra $L__BB17_3; + ld.param.u64 %rd12, [Unknown42_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown42_param_1]; + ld.param.u64 %rd14, [Unknown42_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB17_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + mov.b16 %rs3, 0x0000; + setp.gt.f16 %p2, %rs1, %rs3; + selp.b16 %rs4, %rs2, 0x0000, %p2; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs4; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p3, %rd21, 100352; + @%p3 bra $L__BB17_2; +$L__BB17_3: ret; } @@ -3176,305 +1288,49 @@ $L__BB31_2: .param .u64 Unknown38_param_43 ) { - .reg .pred %p<5>; - .reg .b16 %h<7>; - .reg .b32 %r<4>; - .reg .b64 %rd<51>; - - mov.u32 %r1, %ctaid.x; - 
mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd10, %r3; - mul.wide.s32 %rd11, %r2, %r1; - add.s64 %rd5, %rd11, %rd10; - setp.gt.s64 %p1, %rd5, 100351; - @%p1 bra $L__BB32_2; - ld.param.u64 %rd6, [Unknown38_param_34]; - cvta.to.global.u64 %rd1, %rd6; - ld.param.u64 %rd7, [Unknown38_param_1]; - ld.param.u64 %rd8, [Unknown38_param_23]; - cvta.to.global.u64 %rd2, %rd8; - ld.param.u64 %rd9, [Unknown38_param_12]; - cvta.to.global.u64 %rd3, %rd9; - cvta.to.global.u64 %rd4, %rd7; - mul.hi.s64 %rd12, %rd5, 5270498306774157605; - shr.u64 %rd13, %rd12, 63; - shr.s64 %rd14, %rd12, 3; - add.s64 %rd15, %rd14, %rd13; - mul.lo.s64 %rd16, %rd15, 28; - sub.s64 %rd17, %rd5, %rd16; - setp.lt.s64 %p2, %rd17, 0; - add.s64 %rd18, %rd17, 28; - selp.b64 %rd19, %rd18, %rd17, %p2; - shr.s64 %rd20, %rd5, 63; - xor.b64 %rd21, %rd20, %rd5; - mul.hi.s64 %rd22, %rd21, 5270498306774157605; - shr.u64 %rd23, %rd22, 63; - shr.s64 %rd24, %rd22, 3; - add.s64 %rd25, %rd24, %rd23; - xor.b64 %rd26, %rd25, %rd20; - mul.hi.s64 %rd27, %rd26, 5270498306774157605; - shr.u64 %rd28, %rd27, 63; - shr.s64 %rd29, %rd27, 3; - add.s64 %rd30, %rd29, %rd28; - mul.lo.s64 %rd31, %rd30, 28; - sub.s64 %rd32, %rd26, %rd31; - setp.lt.s64 %p3, %rd32, 0; - add.s64 %rd33, %rd32, 28; - selp.b64 %rd34, %rd33, %rd32, %p3; - shr.s64 %rd35, %rd26, 63; - xor.b64 %rd36, %rd35, %rd26; - mul.hi.s64 %rd37, %rd36, 5270498306774157605; - shr.u64 %rd38, %rd37, 63; - shr.s64 %rd39, %rd37, 3; - add.s64 %rd40, %rd39, %rd38; - xor.b64 %rd41, %rd40, %rd35; - mul.lo.s64 %rd42, %rd41, 784; - mul.lo.s64 %rd43, %rd34, 28; - add.s64 %rd44, %rd43, %rd19; - add.s64 %rd45, %rd44, %rd42; - shl.b64 %rd46, %rd45, 1; - add.s64 %rd47, %rd2, %rd46; - ld.global.b16 %h1, [%rd47]; - add.s64 %rd48, %rd4, %rd46; - ld.global.b16 %h2, [%rd48]; - add.s64 %rd49, %rd3, %rd46; - ld.global.b16 %h3, [%rd49]; - add.rn.f16 %h4, %h2, %h3; - mov.b16 %h5, 0x0000; - setp.gt.f16 %p4, %h1, %h5; - selp.b16 %h6, %h4, 0x0000, %p4; - add.s64 %rd50, %rd1, %rd46; - st.global.b16 [%rd50], %h6; -$L__BB32_2: - ret; - -} - // .globl Unknown31 -.visible .entry Unknown31( - .param .u64 Unknown31_param_0, - .param .u64 Unknown31_param_1, - .param .u64 Unknown31_param_2, - .param .u64 Unknown31_param_3, - .param .u64 Unknown31_param_4, - .param .u64 Unknown31_param_5, - .param .u64 Unknown31_param_6, - .param .u64 Unknown31_param_7, - .param .u64 Unknown31_param_8, - .param .u64 Unknown31_param_9, - .param .u64 Unknown31_param_10, - .param .u64 Unknown31_param_11, - .param .u64 Unknown31_param_12, - .param .u64 Unknown31_param_13, - .param .u64 Unknown31_param_14, - .param .u64 Unknown31_param_15, - .param .u64 Unknown31_param_16, - .param .u64 Unknown31_param_17, - .param .u64 Unknown31_param_18, - .param .u64 Unknown31_param_19, - .param .u64 Unknown31_param_20, - .param .u64 Unknown31_param_21, - .param .u64 Unknown31_param_22, - .param .u64 Unknown31_param_23, - .param .u64 Unknown31_param_24, - .param .u64 Unknown31_param_25, - .param .u64 Unknown31_param_26, - .param .u64 Unknown31_param_27, - .param .u64 Unknown31_param_28, - .param .u64 Unknown31_param_29, - .param .u64 Unknown31_param_30, - .param .u64 Unknown31_param_31, - .param .u64 Unknown31_param_32 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 50175; - @%p1 bra $L__BB33_2; - ld.param.u64 %rd5, [Unknown31_param_23]; - 
cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown31_param_1]; - ld.param.u64 %rd7, [Unknown31_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 2; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 14; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 14; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 2; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 2; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 14; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 14; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 2; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 196; - mul.lo.s64 %rd41, %rd32, 14; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB33_2: - ret; - -} - // .globl Unknown27 -.visible .entry Unknown27( - .param .u64 Unknown27_param_0, - .param .u64 Unknown27_param_1, - .param .u64 Unknown27_param_2, - .param .u64 Unknown27_param_3, - .param .u64 Unknown27_param_4, - .param .u64 Unknown27_param_5, - .param .u64 Unknown27_param_6, - .param .u64 Unknown27_param_7, - .param .u64 Unknown27_param_8, - .param .u64 Unknown27_param_9, - .param .u64 Unknown27_param_10, - .param .u64 Unknown27_param_11, - .param .u64 Unknown27_param_12, - .param .u64 Unknown27_param_13, - .param .u64 Unknown27_param_14, - .param .u64 Unknown27_param_15, - .param .u64 Unknown27_param_16, - .param .u64 Unknown27_param_17, - .param .u64 Unknown27_param_18, - .param .u64 Unknown27_param_19, - .param .u64 Unknown27_param_20, - .param .u64 Unknown27_param_21, - .param .u64 Unknown27_param_22, - .param .u64 Unknown27_param_23, - .param .u64 Unknown27_param_24, - .param .u64 Unknown27_param_25, - .param .u64 Unknown27_param_26, - .param .u64 Unknown27_param_27, - .param .u64 Unknown27_param_28, - .param .u64 Unknown27_param_29, - .param .u64 Unknown27_param_30, - .param .u64 Unknown27_param_31, - .param .u64 Unknown27_param_32, - .param .u64 Unknown27_param_33, - .param .u64 Unknown27_param_34, - .param .u64 Unknown27_param_35, - .param .u64 Unknown27_param_36, - .param .u64 Unknown27_param_37, - .param .u64 Unknown27_param_38, - .param .u64 Unknown27_param_39, - .param .u64 Unknown27_param_40, - .param .u64 Unknown27_param_41, - .param .u64 Unknown27_param_42, - .param .u64 Unknown27_param_43 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<7>; - .reg .b32 %r<4>; - .reg .b64 %rd<51>; + .reg .pred %p<4>; + .reg .b16 %rs<7>; + .reg .b32 %r<5>; + .reg .b64 %rd<25>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd10, %r3; - mul.wide.s32 %rd11, %r2, %r1; - add.s64 %rd5, %rd11, %rd10; - setp.gt.s64 %p1, %rd5, 50175; - @%p1 bra 
$L__BB34_2; - ld.param.u64 %rd6, [Unknown27_param_34]; - cvta.to.global.u64 %rd1, %rd6; - ld.param.u64 %rd7, [Unknown27_param_1]; - ld.param.u64 %rd8, [Unknown27_param_23]; - cvta.to.global.u64 %rd2, %rd8; - ld.param.u64 %rd9, [Unknown27_param_12]; - cvta.to.global.u64 %rd3, %rd9; - cvta.to.global.u64 %rd4, %rd7; - mul.hi.s64 %rd12, %rd5, 5270498306774157605; - shr.u64 %rd13, %rd12, 63; - shr.s64 %rd14, %rd12, 2; - add.s64 %rd15, %rd14, %rd13; - mul.lo.s64 %rd16, %rd15, 14; - sub.s64 %rd17, %rd5, %rd16; - setp.lt.s64 %p2, %rd17, 0; - add.s64 %rd18, %rd17, 14; - selp.b64 %rd19, %rd18, %rd17, %p2; - shr.s64 %rd20, %rd5, 63; - xor.b64 %rd21, %rd20, %rd5; - mul.hi.s64 %rd22, %rd21, 5270498306774157605; - shr.u64 %rd23, %rd22, 63; - shr.s64 %rd24, %rd22, 2; - add.s64 %rd25, %rd24, %rd23; - xor.b64 %rd26, %rd25, %rd20; - mul.hi.s64 %rd27, %rd26, 5270498306774157605; - shr.u64 %rd28, %rd27, 63; - shr.s64 %rd29, %rd27, 2; - add.s64 %rd30, %rd29, %rd28; - mul.lo.s64 %rd31, %rd30, 14; - sub.s64 %rd32, %rd26, %rd31; - setp.lt.s64 %p3, %rd32, 0; - add.s64 %rd33, %rd32, 14; - selp.b64 %rd34, %rd33, %rd32, %p3; - shr.s64 %rd35, %rd26, 63; - xor.b64 %rd36, %rd35, %rd26; - mul.hi.s64 %rd37, %rd36, 5270498306774157605; - shr.u64 %rd38, %rd37, 63; - shr.s64 %rd39, %rd37, 2; - add.s64 %rd40, %rd39, %rd38; - xor.b64 %rd41, %rd40, %rd35; - mul.lo.s64 %rd42, %rd41, 196; - mul.lo.s64 %rd43, %rd34, 14; - add.s64 %rd44, %rd43, %rd19; - add.s64 %rd45, %rd44, %rd42; - shl.b64 %rd46, %rd45, 1; - add.s64 %rd47, %rd2, %rd46; - ld.global.b16 %h1, [%rd47]; - add.s64 %rd48, %rd4, %rd46; - ld.global.b16 %h2, [%rd48]; - add.s64 %rd49, %rd3, %rd46; - ld.global.b16 %h3, [%rd49]; - add.rn.f16 %h4, %h2, %h3; - mov.b16 %h5, 0x0000; - setp.gt.f16 %p4, %h1, %h5; - selp.b16 %h6, %h4, 0x0000, %p4; - add.s64 %rd50, %rd1, %rd46; - st.global.b16 [%rd50], %h6; -$L__BB34_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd24, %rd18, %rd17; + setp.gt.s64 %p1, %rd24, 100351; + @%p1 bra $L__BB18_3; + ld.param.u64 %rd13, [Unknown38_param_34]; + cvta.to.global.u64 %rd1, %rd13; + ld.param.u64 %rd14, [Unknown38_param_1]; + ld.param.u64 %rd15, [Unknown38_param_23]; + cvta.to.global.u64 %rd2, %rd15; + ld.param.u64 %rd16, [Unknown38_param_12]; + cvta.to.global.u64 %rd3, %rd16; + cvta.to.global.u64 %rd4, %rd14; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd6, %r2, %r4; + shl.b64 %rd23, %rd24, 1; + shl.b64 %rd8, %rd6, 1; +$L__BB18_2: + add.s64 %rd19, %rd4, %rd23; + ld.global.nc.u16 %rs1, [%rd19]; + add.s64 %rd20, %rd3, %rd23; + ld.global.nc.u16 %rs2, [%rd20]; + add.s64 %rd21, %rd2, %rd23; + ld.global.nc.u16 %rs3, [%rd21]; + add.rn.f16 %rs4, %rs1, %rs2; + mov.b16 %rs5, 0x0000; + setp.gt.f16 %p2, %rs3, %rs5; + selp.b16 %rs6, %rs4, 0x0000, %p2; + add.s64 %rd22, %rd1, %rd23; + st.global.b16 [%rd22], %rs6; + add.s64 %rd24, %rd24, %rd6; + add.s64 %rd23, %rd23, %rd8; + setp.lt.s64 %p3, %rd24, 100352; + @%p3 bra $L__BB18_2; +$L__BB18_3: ret; } @@ -3515,72 +1371,44 @@ $L__BB34_2: .param .u64 Unknown23_param_32 ) { - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; + .reg .pred %p<4>; + .reg .b16 %rs<5>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 50175; - @%p1 bra $L__BB35_2; - ld.param.u64 %rd5, [Unknown23_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown23_param_1]; - ld.param.u64 %rd7, [Unknown23_param_12]; - 
cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 2; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 14; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 14; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 2; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 2; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 14; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 14; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 2; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 196; - mul.lo.s64 %rd41, %rd32, 14; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB35_2: + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 50175; + @%p1 bra $L__BB19_3; + ld.param.u64 %rd12, [Unknown23_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown23_param_1]; + ld.param.u64 %rd14, [Unknown23_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB19_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + mov.b16 %rs3, 0x0000; + setp.gt.f16 %p2, %rs1, %rs3; + selp.b16 %rs4, %rs2, 0x0000, %p2; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs4; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p3, %rd21, 50176; + @%p3 bra $L__BB19_2; +$L__BB19_3: ret; } @@ -3632,183 +1460,49 @@ $L__BB35_2: .param .u64 Unknown19_param_43 ) { - .reg .pred %p<5>; - .reg .b16 %h<7>; - .reg .b32 %r<4>; - .reg .b64 %rd<51>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd10, %r3; - mul.wide.s32 %rd11, %r2, %r1; - add.s64 %rd5, %rd11, %rd10; - setp.gt.s64 %p1, %rd5, 50175; - @%p1 bra $L__BB36_2; - ld.param.u64 %rd6, [Unknown19_param_34]; - cvta.to.global.u64 %rd1, %rd6; - ld.param.u64 %rd7, [Unknown19_param_1]; - ld.param.u64 %rd8, [Unknown19_param_23]; - cvta.to.global.u64 %rd2, %rd8; - ld.param.u64 %rd9, [Unknown19_param_12]; - cvta.to.global.u64 %rd3, %rd9; - cvta.to.global.u64 %rd4, %rd7; - mul.hi.s64 %rd12, %rd5, 5270498306774157605; - shr.u64 %rd13, %rd12, 63; - shr.s64 %rd14, %rd12, 2; - add.s64 %rd15, %rd14, %rd13; - mul.lo.s64 %rd16, %rd15, 14; - sub.s64 %rd17, %rd5, %rd16; - setp.lt.s64 %p2, %rd17, 0; - add.s64 %rd18, %rd17, 14; - selp.b64 %rd19, %rd18, %rd17, %p2; - shr.s64 %rd20, %rd5, 63; - xor.b64 %rd21, %rd20, %rd5; - mul.hi.s64 %rd22, %rd21, 5270498306774157605; - shr.u64 %rd23, %rd22, 63; - shr.s64 %rd24, %rd22, 2; 
- add.s64 %rd25, %rd24, %rd23; - xor.b64 %rd26, %rd25, %rd20; - mul.hi.s64 %rd27, %rd26, 5270498306774157605; - shr.u64 %rd28, %rd27, 63; - shr.s64 %rd29, %rd27, 2; - add.s64 %rd30, %rd29, %rd28; - mul.lo.s64 %rd31, %rd30, 14; - sub.s64 %rd32, %rd26, %rd31; - setp.lt.s64 %p3, %rd32, 0; - add.s64 %rd33, %rd32, 14; - selp.b64 %rd34, %rd33, %rd32, %p3; - shr.s64 %rd35, %rd26, 63; - xor.b64 %rd36, %rd35, %rd26; - mul.hi.s64 %rd37, %rd36, 5270498306774157605; - shr.u64 %rd38, %rd37, 63; - shr.s64 %rd39, %rd37, 2; - add.s64 %rd40, %rd39, %rd38; - xor.b64 %rd41, %rd40, %rd35; - mul.lo.s64 %rd42, %rd41, 196; - mul.lo.s64 %rd43, %rd34, 14; - add.s64 %rd44, %rd43, %rd19; - add.s64 %rd45, %rd44, %rd42; - shl.b64 %rd46, %rd45, 1; - add.s64 %rd47, %rd2, %rd46; - ld.global.b16 %h1, [%rd47]; - add.s64 %rd48, %rd4, %rd46; - ld.global.b16 %h2, [%rd48]; - add.s64 %rd49, %rd3, %rd46; - ld.global.b16 %h3, [%rd49]; - add.rn.f16 %h4, %h2, %h3; - mov.b16 %h5, 0x0000; - setp.gt.f16 %p4, %h1, %h5; - selp.b16 %h6, %h4, 0x0000, %p4; - add.s64 %rd50, %rd1, %rd46; - st.global.b16 [%rd50], %h6; -$L__BB36_2: - ret; - -} - // .globl Unknown12 -.visible .entry Unknown12( - .param .u64 Unknown12_param_0, - .param .u64 Unknown12_param_1, - .param .u64 Unknown12_param_2, - .param .u64 Unknown12_param_3, - .param .u64 Unknown12_param_4, - .param .u64 Unknown12_param_5, - .param .u64 Unknown12_param_6, - .param .u64 Unknown12_param_7, - .param .u64 Unknown12_param_8, - .param .u64 Unknown12_param_9, - .param .u64 Unknown12_param_10, - .param .u64 Unknown12_param_11, - .param .u64 Unknown12_param_12, - .param .u64 Unknown12_param_13, - .param .u64 Unknown12_param_14, - .param .u64 Unknown12_param_15, - .param .u64 Unknown12_param_16, - .param .u64 Unknown12_param_17, - .param .u64 Unknown12_param_18, - .param .u64 Unknown12_param_19, - .param .u64 Unknown12_param_20, - .param .u64 Unknown12_param_21, - .param .u64 Unknown12_param_22, - .param .u64 Unknown12_param_23, - .param .u64 Unknown12_param_24, - .param .u64 Unknown12_param_25, - .param .u64 Unknown12_param_26, - .param .u64 Unknown12_param_27, - .param .u64 Unknown12_param_28, - .param .u64 Unknown12_param_29, - .param .u64 Unknown12_param_30, - .param .u64 Unknown12_param_31, - .param .u64 Unknown12_param_32 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; + .reg .pred %p<4>; + .reg .b16 %rs<7>; + .reg .b32 %r<5>; + .reg .b64 %rd<25>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 25087; - @%p1 bra $L__BB37_2; - ld.param.u64 %rd5, [Unknown12_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown12_param_1]; - ld.param.u64 %rd7, [Unknown12_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 1; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 7; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 7; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 1; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 1; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 7; - sub.s64 %rd30, 
%rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 7; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.u64 %rd37, %rd35, 1; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 49; - mul.lo.s64 %rd41, %rd32, 7; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB37_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd24, %rd18, %rd17; + setp.gt.s64 %p1, %rd24, 50175; + @%p1 bra $L__BB20_3; + ld.param.u64 %rd13, [Unknown19_param_34]; + cvta.to.global.u64 %rd1, %rd13; + ld.param.u64 %rd14, [Unknown19_param_1]; + ld.param.u64 %rd15, [Unknown19_param_23]; + cvta.to.global.u64 %rd2, %rd15; + ld.param.u64 %rd16, [Unknown19_param_12]; + cvta.to.global.u64 %rd3, %rd16; + cvta.to.global.u64 %rd4, %rd14; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd6, %r2, %r4; + shl.b64 %rd23, %rd24, 1; + shl.b64 %rd8, %rd6, 1; +$L__BB20_2: + add.s64 %rd19, %rd4, %rd23; + ld.global.nc.u16 %rs1, [%rd19]; + add.s64 %rd20, %rd3, %rd23; + ld.global.nc.u16 %rs2, [%rd20]; + add.s64 %rd21, %rd2, %rd23; + ld.global.nc.u16 %rs3, [%rd21]; + add.rn.f16 %rs4, %rs1, %rs2; + mov.b16 %rs5, 0x0000; + setp.gt.f16 %p2, %rs3, %rs5; + selp.b16 %rs6, %rs4, 0x0000, %p2; + add.s64 %rd22, %rd1, %rd23; + st.global.b16 [%rd22], %rs6; + add.s64 %rd24, %rd24, %rd6; + add.s64 %rd23, %rd23, %rd8; + setp.lt.s64 %p3, %rd24, 50176; + @%p3 bra $L__BB20_2; +$L__BB20_3: ret; } @@ -3860,77 +1554,49 @@ $L__BB37_2: .param .u64 Unknown8_param_43 ) { - .reg .pred %p<5>; - .reg .b16 %h<7>; - .reg .b32 %r<4>; - .reg .b64 %rd<51>; + .reg .pred %p<4>; + .reg .b16 %rs<7>; + .reg .b32 %r<5>; + .reg .b64 %rd<25>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd10, %r3; - mul.wide.s32 %rd11, %r2, %r1; - add.s64 %rd5, %rd11, %rd10; - setp.gt.s64 %p1, %rd5, 25087; - @%p1 bra $L__BB38_2; - ld.param.u64 %rd6, [Unknown8_param_34]; - cvta.to.global.u64 %rd1, %rd6; - ld.param.u64 %rd7, [Unknown8_param_1]; - ld.param.u64 %rd8, [Unknown8_param_23]; - cvta.to.global.u64 %rd2, %rd8; - ld.param.u64 %rd9, [Unknown8_param_12]; - cvta.to.global.u64 %rd3, %rd9; - cvta.to.global.u64 %rd4, %rd7; - mul.hi.s64 %rd12, %rd5, 5270498306774157605; - shr.u64 %rd13, %rd12, 63; - shr.s64 %rd14, %rd12, 1; - add.s64 %rd15, %rd14, %rd13; - mul.lo.s64 %rd16, %rd15, 7; - sub.s64 %rd17, %rd5, %rd16; - setp.lt.s64 %p2, %rd17, 0; - add.s64 %rd18, %rd17, 7; - selp.b64 %rd19, %rd18, %rd17, %p2; - shr.s64 %rd20, %rd5, 63; - xor.b64 %rd21, %rd20, %rd5; - mul.hi.s64 %rd22, %rd21, 5270498306774157605; - shr.u64 %rd23, %rd22, 63; - shr.s64 %rd24, %rd22, 1; - add.s64 %rd25, %rd24, %rd23; - xor.b64 %rd26, %rd25, %rd20; - mul.hi.s64 %rd27, %rd26, 5270498306774157605; - shr.u64 %rd28, %rd27, 63; - shr.s64 %rd29, %rd27, 1; - add.s64 %rd30, %rd29, %rd28; - mul.lo.s64 %rd31, %rd30, 7; - sub.s64 %rd32, %rd26, %rd31; - setp.lt.s64 %p3, %rd32, 0; - add.s64 %rd33, %rd32, 7; - selp.b64 %rd34, %rd33, %rd32, %p3; - shr.s64 %rd35, %rd26, 63; - xor.b64 %rd36, %rd35, %rd26; - mul.hi.s64 %rd37, %rd36, 5270498306774157605; - shr.u64 %rd38, %rd37, 63; - shr.u64 %rd39, %rd37, 1; - add.s64 
%rd40, %rd39, %rd38; - xor.b64 %rd41, %rd40, %rd35; - mul.lo.s64 %rd42, %rd41, 49; - mul.lo.s64 %rd43, %rd34, 7; - add.s64 %rd44, %rd43, %rd19; - add.s64 %rd45, %rd44, %rd42; - shl.b64 %rd46, %rd45, 1; - add.s64 %rd47, %rd2, %rd46; - ld.global.b16 %h1, [%rd47]; - add.s64 %rd48, %rd4, %rd46; - ld.global.b16 %h2, [%rd48]; - add.s64 %rd49, %rd3, %rd46; - ld.global.b16 %h3, [%rd49]; - add.rn.f16 %h4, %h2, %h3; - mov.b16 %h5, 0x0000; - setp.gt.f16 %p4, %h1, %h5; - selp.b16 %h6, %h4, 0x0000, %p4; - add.s64 %rd50, %rd1, %rd46; - st.global.b16 [%rd50], %h6; -$L__BB38_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd24, %rd18, %rd17; + setp.gt.s64 %p1, %rd24, 25087; + @%p1 bra $L__BB21_3; + ld.param.u64 %rd13, [Unknown8_param_34]; + cvta.to.global.u64 %rd1, %rd13; + ld.param.u64 %rd14, [Unknown8_param_1]; + ld.param.u64 %rd15, [Unknown8_param_23]; + cvta.to.global.u64 %rd2, %rd15; + ld.param.u64 %rd16, [Unknown8_param_12]; + cvta.to.global.u64 %rd3, %rd16; + cvta.to.global.u64 %rd4, %rd14; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd6, %r2, %r4; + shl.b64 %rd23, %rd24, 1; + shl.b64 %rd8, %rd6, 1; +$L__BB21_2: + add.s64 %rd19, %rd4, %rd23; + ld.global.nc.u16 %rs1, [%rd19]; + add.s64 %rd20, %rd3, %rd23; + ld.global.nc.u16 %rs2, [%rd20]; + add.s64 %rd21, %rd2, %rd23; + ld.global.nc.u16 %rs3, [%rd21]; + add.rn.f16 %rs4, %rs1, %rs2; + mov.b16 %rs5, 0x0000; + setp.gt.f16 %p2, %rs3, %rs5; + selp.b16 %rs6, %rs4, 0x0000, %p2; + add.s64 %rd22, %rd1, %rd23; + st.global.b16 [%rd22], %rs6; + add.s64 %rd24, %rd24, %rd6; + add.s64 %rd23, %rd23, %rd8; + setp.lt.s64 %p3, %rd24, 25088; + @%p3 bra $L__BB21_2; +$L__BB21_3: ret; } @@ -3971,72 +1637,44 @@ $L__BB38_2: .param .u64 Unknown4_param_32 ) { - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; + .reg .pred %p<4>; + .reg .b16 %rs<5>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 25087; - @%p1 bra $L__BB39_2; - ld.param.u64 %rd5, [Unknown4_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown4_param_1]; - ld.param.u64 %rd7, [Unknown4_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 1; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 7; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 7; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 1; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 1; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 7; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 7; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.u64 %rd37, %rd35, 1; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 49; - mul.lo.s64 %rd41, %rd32, 7; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 
%rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB39_2: + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 25087; + @%p1 bra $L__BB22_3; + ld.param.u64 %rd12, [Unknown4_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown4_param_1]; + ld.param.u64 %rd14, [Unknown4_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB22_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + mov.b16 %rs3, 0x0000; + setp.gt.f16 %p2, %rs1, %rs3; + selp.b16 %rs4, %rs2, 0x0000, %p2; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs4; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p3, %rd21, 25088; + @%p3 bra $L__BB22_2; +$L__BB22_3: ret; } @@ -4073,77 +1711,53 @@ $L__BB39_2: .param .u64 Unknown0_param_28 ) { - .reg .pred %p<5>; - .reg .b16 %h<6>; - .reg .b32 %r<4>; + .reg .pred %p<4>; + .reg .b16 %rs<6>; + .reg .b32 %r<5>; .reg .f32 %f<3>; - .reg .b64 %rd<49>; + .reg .b64 %rd<27>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 25087; - @%p1 bra $L__BB40_2; - ld.param.u64 %rd5, [Unknown0_param_19]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown0_param_1]; - ld.param.u64 %rd7, [Unknown0_param_8]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 1; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 7; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 7; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 1; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 1; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 7; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 7; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 1; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 49; - mul.lo.s64 %rd41, %rd32, 7; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd2, %rd44; - ld.global.b16 %h1, [%rd45]; - shl.b64 %rd46, %rd39, 1; - add.s64 %rd47, %rd3, %rd46; - ld.global.b16 %h2, [%rd47]; - cvt.f32.f16 %f1, %h2; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd26, %rd16, %rd15; + setp.gt.s64 %p1, %rd26, 25087; + @%p1 bra $L__BB23_3; + ld.param.u64 %rd12, [Unknown0_param_19]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown0_param_1]; + ld.param.u64 %rd14, [Unknown0_param_8]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 
%rd5, %r2, %r4; + shl.b64 %rd25, %rd26, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB23_2: + mul.hi.s64 %rd17, %rd26, 6023426636313322977; + shr.u64 %rd18, %rd17, 63; + shr.s64 %rd19, %rd17, 4; + add.s64 %rd20, %rd19, %rd18; + shl.b64 %rd21, %rd20, 1; + add.s64 %rd22, %rd3, %rd21; + ld.global.nc.u16 %rs1, [%rd22]; + add.s64 %rd23, %rd2, %rd25; + ld.global.nc.u16 %rs2, [%rd23]; + cvt.f32.f16 %f1, %rs1; div.rn.f32 %f2, %f1, 0f42440000; - cvt.rn.f16.f32 %h3, %f2; - mov.b16 %h4, 0x0000; - setp.gt.f16 %p4, %h1, %h4; - selp.b16 %h5, %h3, 0x0000, %p4; - add.s64 %rd48, %rd1, %rd44; - st.global.b16 [%rd48], %h5; -$L__BB40_2: + cvt.rn.f16.f32 %rs3, %f2; + mov.b16 %rs4, 0x0000; + setp.gt.f16 %p2, %rs2, %rs4; + selp.b16 %rs5, %rs3, 0x0000, %p2; + add.s64 %rd24, %rd1, %rd25; + st.global.b16 [%rd24], %rs5; + add.s64 %rd26, %rd26, %rd5; + add.s64 %rd25, %rd25, %rd7; + setp.lt.s64 %p3, %rd26, 25088; + @%p3 bra $L__BB23_2; +$L__BB23_3: ret; } diff --git a/compiler/test/E2E/ResNet18/BW/host_output.mlir b/compiler/test/E2E/ResNet18/BW/host_output.mlir index 0ad518caf..392748525 100644 --- a/compiler/test/E2E/ResNet18/BW/host_output.mlir +++ b/compiler/test/E2E/ResNet18/BW/host_output.mlir @@ -5,161 +5,157 @@ module attributes {byre.container_module, gpu.container_module} { func.func @main(%arg0: memref<64xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<64xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<64xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<64xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<128xf32, "cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<128xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<128xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<128xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<128xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<128xf32, "cuda"> {byre.argname = "Input15", byre.argtype = 1 : i32}, %arg16: memref<128xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<128xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<128xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<128xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<256xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<256xf32, "cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<256xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<256xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<256xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<256xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<256xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<256xf32, "cuda"> 
{byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<256xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<256xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<512xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, %arg31: memref<512xf32, "cuda"> {byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<512xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<512xf32, "cuda"> {byre.argname = "Input33", byre.argtype = 1 : i32}, %arg34: memref<512xf32, "cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<512xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<512xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<512xf32, "cuda"> {byre.argname = "Input37", byre.argtype = 1 : i32}, %arg38: memref<512xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<512xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<64xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<64xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<64xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<64xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<64xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<64xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<64xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<64xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<64xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<64xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<128xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<128xf32, "cuda"> {byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<128xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<128xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<128xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<128xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<128xf32, "cuda"> {byre.argname = "Input56", byre.argtype = 1 : i32}, %arg57: memref<128xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<128xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<128xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<256xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<256xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref<256xf32, "cuda"> {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<256xf32, "cuda"> {byre.argname = "Input63", byre.argtype = 1 : i32}, %arg64: memref<256xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref<256xf32, "cuda"> {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<256xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<256xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref<256xf32, "cuda"> {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<256xf32, "cuda"> {byre.argname = "Input69", 
byre.argtype = 1 : i32}, %arg70: memref<512xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref<512xf32, "cuda"> {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<512xf32, "cuda"> {byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<512xf32, "cuda"> {byre.argname = "Input73", byre.argtype = 1 : i32}, %arg74: memref<512xf32, "cuda"> {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<512xf32, "cuda"> {byre.argname = "Input75", byre.argtype = 1 : i32}, %arg76: memref<512xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref<512xf32, "cuda"> {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<512xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: memref<512xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref<64x3x7x7xf16, "cuda"> {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<1x3x224x224xf16, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input95", byre.argtype = 1 : i32}, %arg96: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<128x64x3x3xf16, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input101", byre.argtype = 1 : i32}, %arg102: memref<128x64x1x1xf16, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input103", byre.argtype = 1 : i32}, %arg104: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input104", byre.argtype = 1 : i32}, %arg105: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input105", byre.argtype = 1 : i32}, %arg106: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input106", byre.argtype = 1 : i32}, %arg107: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input107", byre.argtype = 1 : i32}, %arg108: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input108", byre.argtype = 1 : i32}, %arg109: 
memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input109", byre.argtype = 1 : i32}, %arg110: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input110", byre.argtype = 1 : i32}, %arg111: memref<256x128x3x3xf16, "cuda"> {byre.argname = "Input111", byre.argtype = 1 : i32}, %arg112: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input112", byre.argtype = 1 : i32}, %arg113: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input113", byre.argtype = 1 : i32}, %arg114: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input114", byre.argtype = 1 : i32}, %arg115: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input115", byre.argtype = 1 : i32}, %arg116: memref<256x128x1x1xf16, "cuda"> {byre.argname = "Input116", byre.argtype = 1 : i32}, %arg117: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input117", byre.argtype = 1 : i32}, %arg118: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input118", byre.argtype = 1 : i32}, %arg119: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input119", byre.argtype = 1 : i32}, %arg120: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input120", byre.argtype = 1 : i32}, %arg121: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input121", byre.argtype = 1 : i32}, %arg122: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input122", byre.argtype = 1 : i32}, %arg123: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input123", byre.argtype = 1 : i32}, %arg124: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input124", byre.argtype = 1 : i32}, %arg125: memref<512x256x3x3xf16, "cuda"> {byre.argname = "Input125", byre.argtype = 1 : i32}, %arg126: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input126", byre.argtype = 1 : i32}, %arg127: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input127", byre.argtype = 1 : i32}, %arg128: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input128", byre.argtype = 1 : i32}, %arg129: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input129", byre.argtype = 1 : i32}, %arg130: memref<512x256x1x1xf16, "cuda"> {byre.argname = "Input130", byre.argtype = 1 : i32}, %arg131: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input131", byre.argtype = 1 : i32}, %arg132: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input132", byre.argtype = 1 : i32}, %arg133: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input133", byre.argtype = 1 : i32}, %arg134: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input134", byre.argtype = 1 : i32}, %arg135: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input135", byre.argtype = 1 : i32}, %arg136: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input136", byre.argtype = 1 : i32}, %arg137: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input137", byre.argtype = 1 : i32}, %arg138: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input138", byre.argtype = 1 : i32}, %arg139: memref<1x512xf16, "cuda"> {byre.argname = "Input139", byre.argtype = 1 : i32}, %arg140: memref<512x1000xf16, "cuda"> {byre.argname = "Input140", byre.argtype = 1 : i32}, %arg141: memref<1x1000xf16, "cuda"> {byre.argname = "Input141", byre.argtype = 1 : i32}, %arg142: memref<64xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg143: memref<64xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}, %arg144: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Output2", byre.argtype = 2 : i32}, %arg145: memref<1000xf32, "cuda"> {byre.argname = "Output3", byre.argtype = 2 : i32}, %arg146: memref<1000x512xf32, "cuda"> {byre.argname = "Output4", byre.argtype = 2 : i32}, %arg147: memref<64xf32, "cuda"> {byre.argname = "Output5", 
byre.argtype = 2 : i32}, %arg148: memref<64xf32, "cuda"> {byre.argname = "Output6", byre.argtype = 2 : i32}, %arg149: memref<64xf32, "cuda"> {byre.argname = "Output7", byre.argtype = 2 : i32}, %arg150: memref<64xf32, "cuda"> {byre.argname = "Output8", byre.argtype = 2 : i32}, %arg151: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output9", byre.argtype = 2 : i32}, %arg152: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output10", byre.argtype = 2 : i32}, %arg153: memref<64xf32, "cuda"> {byre.argname = "Output11", byre.argtype = 2 : i32}, %arg154: memref<64xf32, "cuda"> {byre.argname = "Output12", byre.argtype = 2 : i32}, %arg155: memref<64xf32, "cuda"> {byre.argname = "Output13", byre.argtype = 2 : i32}, %arg156: memref<64xf32, "cuda"> {byre.argname = "Output14", byre.argtype = 2 : i32}, %arg157: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output15", byre.argtype = 2 : i32}, %arg158: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output16", byre.argtype = 2 : i32}, %arg159: memref<128xf32, "cuda"> {byre.argname = "Output17", byre.argtype = 2 : i32}, %arg160: memref<128xf32, "cuda"> {byre.argname = "Output18", byre.argtype = 2 : i32}, %arg161: memref<128xf32, "cuda"> {byre.argname = "Output19", byre.argtype = 2 : i32}, %arg162: memref<128xf32, "cuda"> {byre.argname = "Output20", byre.argtype = 2 : i32}, %arg163: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Output21", byre.argtype = 2 : i32}, %arg164: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output22", byre.argtype = 2 : i32}, %arg165: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Output23", byre.argtype = 2 : i32}, %arg166: memref<128xf32, "cuda"> {byre.argname = "Output24", byre.argtype = 2 : i32}, %arg167: memref<128xf32, "cuda"> {byre.argname = "Output25", byre.argtype = 2 : i32}, %arg168: memref<128xf32, "cuda"> {byre.argname = "Output26", byre.argtype = 2 : i32}, %arg169: memref<128xf32, "cuda"> {byre.argname = "Output27", byre.argtype = 2 : i32}, %arg170: memref<128xf32, "cuda"> {byre.argname = "Output28", byre.argtype = 2 : i32}, %arg171: memref<128xf32, "cuda"> {byre.argname = "Output29", byre.argtype = 2 : i32}, %arg172: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output30", byre.argtype = 2 : i32}, %arg173: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output31", byre.argtype = 2 : i32}, %arg174: memref<256xf32, "cuda"> {byre.argname = "Output32", byre.argtype = 2 : i32}, %arg175: memref<256xf32, "cuda"> {byre.argname = "Output33", byre.argtype = 2 : i32}, %arg176: memref<256xf32, "cuda"> {byre.argname = "Output34", byre.argtype = 2 : i32}, %arg177: memref<256xf32, "cuda"> {byre.argname = "Output35", byre.argtype = 2 : i32}, %arg178: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Output36", byre.argtype = 2 : i32}, %arg179: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output37", byre.argtype = 2 : i32}, %arg180: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Output38", byre.argtype = 2 : i32}, %arg181: memref<256xf32, "cuda"> {byre.argname = "Output39", byre.argtype = 2 : i32}, %arg182: memref<256xf32, "cuda"> {byre.argname = "Output40", byre.argtype = 2 : i32}, %arg183: memref<256xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : i32}, %arg184: memref<256xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg185: memref<256xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg186: memref<256xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg187: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, 
%arg188: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg189: memref<512xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg190: memref<512xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg191: memref<512xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : i32}, %arg192: memref<512xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg193: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg194: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg195: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Output53", byre.argtype = 2 : i32}, %arg196: memref<512xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg197: memref<512xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg198: memref<512xf32, "cuda"> {byre.argname = "Output56", byre.argtype = 2 : i32}, %arg199: memref<512xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg200: memref<512xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg201: memref<512xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg202: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg203: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}) attributes {byre.entry_point, device_file_name = "your_file"} { %alloc = memref.alloc() : memref<25927680xi8, "cuda"> - %0 = "byre.alias"(%alloc) {device = "cuda", offset = 1671168 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512xf16, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 1671168 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512xf16, "cuda"> byre.compute @MatmulOp_f16f16_f16(%arg141, %arg140, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x1000xf16, "cuda">, memref<512x1000xf16, "cuda">, memref<1x512xf16, "cuda"> - %1 = "byre.alias"(%alloc) {device = "cuda", offset = 16490496 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%0, %arg138, %1) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %2 = "byre.alias"(%alloc) {device = "cuda", offset = 21209088 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 16490496 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%0, %arg138, %1) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 21209088 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg137, %arg39, %1, %2, %arg201, %arg200) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, 
memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %3 = "byre.alias"(%alloc) {device = "cuda", offset = 16540672 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %3 = "byre.alias"(%alloc) <{offset = 16540672 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%2, %arg136, %3) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %4 = "byre.alias"(%alloc) {device = "cuda", offset = 1671168 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 1671168 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg135, %2, %4) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg135, %3, %2) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg135, %3, %2) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg134, %arg37, %2, %3, %arg199, %arg198) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %5 = "byre.alias"(%alloc) {device = "cuda", offset = 14131200 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 14131200 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%3, %arg133, %5) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %6 = "byre.alias"(%alloc) {device = "cuda", offset = 21209088 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 21209088 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> 
byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg132, %3, %6) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - %7 = "byre.alias"(%alloc) {device = "cuda", offset = 9740288 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%1, %5, %arg132, %7) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown8", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%1, %5, %arg132, %7) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown8", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg129, %arg33, %7, %5, %arg192, %arg191) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %8 = "byre.alias"(%alloc) {device = "cuda", offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%5, %arg128, %8) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %9 = "byre.alias"(%alloc) {device = "cuda", offset = 16490496 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 16490496 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg127, %5, %9) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg127, %8, %5) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown12", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, 
memref<1x512x7x7xf16, "cuda"> - %10 = "byre.alias"(%alloc) {device = "cuda", offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg127, %8, %5) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 10970112 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg126, %arg31, %5, %10, %arg190, %arg189) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %11 = "byre.alias"(%alloc) {device = "cuda", offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%10, %arg125, %11) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %12 = "byre.alias"(%alloc) {device = "cuda", offset = 14131200 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 14131200 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %10, %12) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg131, %arg35, %7, %10, %arg197, %arg196) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %13 = "byre.alias"(%alloc) {device = "cuda", offset = 12625920 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%10, %arg130, %13) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %14 = "byre.alias"(%alloc) {device = "cuda", offset = 819200 : i64} : (memref<25927680xi8, "cuda">) 
-> memref<512x256x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %10, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> - %15 = "byre.alias"(%alloc) {device = "cuda", offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%13, %11, %arg124, %15) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg123, %arg29, %15, %11, %arg186, %arg185) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %16 = "byre.alias"(%alloc) {device = "cuda", offset = 11020288 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg122, %16) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %17 = "byre.alias"(%alloc) {device = "cuda", offset = 9740288 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg121, %11, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg121, %16, %11) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg120, %arg27, %11, %16, %arg184, %arg183) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%16, %arg119, %11) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %18 = "byre.alias"(%alloc) {device = "cuda", offset = 7380992 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %16, %18) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - %19 = "byre.alias"(%alloc) {device = "cuda", offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%15, %11, %arg118, %19) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown27", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg115, %arg23, %19, %11, %arg177, %arg176) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg114, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %20 = "byre.alias"(%alloc) {device = "cuda", offset = 8560640 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %11, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg113, %15, %11) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown31", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %21 = "byre.alias"(%alloc) {device = "cuda", offset = 6490112 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg112, %arg21, %11, %21, %arg175, %arg174) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, 
memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %22 = "byre.alias"(%alloc) {device = "cuda", offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%21, %arg111, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %23 = "byre.alias"(%alloc) {device = "cuda", offset = 6791168 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %21, %23) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> - %24 = "byre.alias"(%alloc) {device = "cuda", offset = 11120640 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg117, %arg25, %19, %24, %arg182, %arg181) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %25 = "byre.alias"(%alloc) {device = "cuda", offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 9740288 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg131, %arg35, %7, %13, %arg197, %arg196) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 12625920 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%13, %arg130, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 819200 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %13, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : 
memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%14, %11, %arg124, %16) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg123, %arg29, %16, %11, %arg186, %arg185) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 11020288 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg122, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %18 = "byre.alias"(%alloc) <{offset = 9740288 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg121, %11, %18) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg121, %17, %11) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg120, %arg27, %11, %17, %arg184, %arg183) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 8560640 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%17, %arg119, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %20 = "byre.alias"(%alloc) <{offset 
= 7380992 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %17, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%16, %19, %arg118, %11) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg115, %arg23, %11, %16, %arg177, %arg176) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%16, %arg114, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 8560640 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %16, %21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg113, %14, %16) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg112, %arg21, %16, %14, %arg175, %arg174) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%14, %arg111, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides 
= dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 6791168 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %14, %23) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> + %24 = "byre.alias"(%alloc) <{offset = 11120640 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg117, %arg25, %11, %24, %arg182, %arg181) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%24, %arg116, %25) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %26 = "byre.alias"(%alloc) {device = "cuda", offset = 311296 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 311296 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %24, %26) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> - %27 = "byre.alias"(%alloc) {device = "cuda", offset = 1081344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%25, %22, %arg110, %27) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 1081344 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%25, %22, %arg110, %27) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, 
memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg109, %arg19, %27, %25, %arg171, %arg170) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%25, %arg108, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %28 = "byre.alias"(%alloc) {device = "cuda", offset = 0 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %28 = "byre.alias"(%alloc) <{offset = 0 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg107, %25, %28) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg107, %22, %25) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown42", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg107, %22, %25) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown42", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg106, %arg17, %25, %22, %arg169, %arg168) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%22, %arg105, %25) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %29 = "byre.alias"(%alloc) {device = "cuda", offset = 1376256 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 1376256 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg104, %22, %29) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, 
input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %30 = "byre.alias"(%alloc) {device = "cuda", offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%27, %25, %arg104, %30) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown46", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 6389760 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%27, %25, %arg104, %30) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg101, %arg13, %30, %25, %arg162, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%25, %arg100, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %31 = "byre.alias"(%alloc) {device = "cuda", offset = 1081344 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 1081344 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg99, %25, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg99, %22, %25) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown50", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %32 = "byre.alias"(%alloc) {device = "cuda", offset = 6590464 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg99, %22, %25) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown42", memory_effects = [1 
: i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 6590464 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg98, %arg11, %25, %32, %arg160, %arg159) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %33 = "byre.alias"(%alloc) {device = "cuda", offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%32, %arg97, %33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %34 = "byre.alias"(%alloc) {device = "cuda", offset = 671744 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 671744 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg96, %32, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> - %35 = "byre.alias"(%alloc) {device = "cuda", offset = 11321344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 11321344 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg103, %arg15, %30, %35, %arg167, %arg166) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %36 = "byre.alias"(%alloc) {device = "cuda", offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%35, %arg102, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %37 = "byre.alias"(%alloc) 
{device = "cuda", offset = 294912 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 294912 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg96, %35, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> - %38 = "byre.alias"(%alloc) {device = "cuda", offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%36, %33, %arg96, %38) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 6389760 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%36, %33, %arg96, %38) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg95, %arg9, %38, %36, %arg156, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%36, %arg94, %33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %39 = "byre.alias"(%alloc) {device = "cuda", offset = 376832 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 598016 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg93, %36, %39) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg93, %33, %36) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, 
memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg93, %33, %36) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg92, %arg7, %36, %33, %arg154, %arg153) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%33, %arg91, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %40 = "byre.alias"(%alloc) {device = "cuda", offset = 524288 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 524288 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg90, %33, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%38, %36, %arg90, %33) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown65", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%38, %36, %arg90, %33) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg89, %arg5, %33, %36, %arg150, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %41 = "byre.alias"(%alloc) {device = "cuda", offset = 11321344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 11321344 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%36, %arg88, %41) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 
2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %42 = "byre.alias"(%alloc) {device = "cuda", offset = 598016 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %42 = "byre.alias"(%alloc) <{offset = 376832 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg87, %36, %42) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg87, %41, %36) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg87, %41, %36) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg86, %arg3, %36, %41, %arg148, %arg147) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%41, %arg85, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %43 = "byre.alias"(%alloc) {device = "cuda", offset = 450560 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 450560 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg84, %41, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%33, %36, %38) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %44 = "byre.alias"(%alloc) {device = "cuda", offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, 
"cuda"> + byre.compute @PTXOp(%33, %36, %38) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> byre.compute @PoolMaxGradOp_f16f16_f16(%arg83, %38, %44) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<1x64x112x112xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> - %45 = "byre.alias"(%alloc) {device = "cuda", offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> - byre.compute @PTXOp(%arg83, %44, %45) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%arg83, %44, %45) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg82, %arg1, %45, %44, %arg143, %arg142) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %46 = "byre.alias"(%alloc) {device = "cuda", offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg81, %44, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> - byre.compute @PTXOp(%46, %arg144) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown77", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> - %47 = "byre.alias"(%alloc) {device = "cuda", offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x1000xf32, "cuda"> - byre.compute @PTXOp(%arg141, %47) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown78", memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf16, "cuda">, 
memref<1x1000xf32, "cuda"> - %48 = "byre.alias"(%alloc) {device = "cuda", offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1000xf32, "cuda"> - byre.compute @ReduceSumOp_f32_f32(%47, %48) {device = "cuda", dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf32, "cuda">, memref<1000xf32, "cuda"> - byre.compute @PTXOp(%48, %arg145) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown79", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda"> - %49 = "byre.alias"(%arg141) {device = "cuda", offset = 0 : i64} : (memref<1x1000xf16, "cuda">) -> memref<1000x1xf16, "cuda"> - %50 = "byre.alias"(%alloc) {device = "cuda", offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%49, %arg139, %50) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16, "cuda">, memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%50, %arg146) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> - byre.compute @PTXOp(%43, %arg151) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown81", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%42, %arg152) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%40, %arg157) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown83", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%39, %arg158) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown84", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%34, %arg163) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%31, %arg164) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown86", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%37, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown87", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> - byre.compute @PTXOp(%29, %arg172) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown88", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%28, %arg173) 
{BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown89", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%23, %arg178) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%20, %arg179) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown91", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%26, %arg180) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> - byre.compute @PTXOp(%18, %arg187) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown93", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%17, %arg188) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown94", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%12, %arg193) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%9, %arg194) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown96", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%14, %arg195) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown97", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> - byre.compute @PTXOp(%6, %arg202) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown98", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%4, %arg203) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown99", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%46, %arg144) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown77", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> + byre.compute @PTXOp(%arg141, %arg145) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown78", memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf16, "cuda">, memref<1000xf32, "cuda"> + %47 = "byre.alias"(%arg141) <{offset = 0 : i64}> {device = "cuda"} : (memref<1x1000xf16, "cuda">) -> memref<1000x1xf16, "cuda"> + %48 = 
"byre.alias"(%alloc) <{offset = 12525568 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%47, %arg139, %48) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16, "cuda">, memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%48, %arg146) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown79", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> + byre.compute @PTXOp(%43, %arg151) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%42, %arg152) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%40, %arg157) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%39, %arg158) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%34, %arg163) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown84", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%31, %arg164) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%37, %arg165) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown86", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> + byre.compute @PTXOp(%29, %arg172) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%28, %arg173) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%23, %arg178) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown89", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%21, %arg179) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, 
memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%26, %arg180) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown91", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> + byre.compute @PTXOp(%20, %arg187) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%18, %arg188) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%12, %arg193) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown94", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%9, %arg194) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%15, %arg195) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown96", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> + byre.compute @PTXOp(%6, %arg202) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%4, %arg203) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> return } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/FW/10b_ptx_codegen.mlir b/compiler/test/E2E/ResNet18/FW/10b_ptx_codegen.mlir index efbda3322..54a161e09 100644 --- a/compiler/test/E2E/ResNet18/FW/10b_ptx_codegen.mlir +++ b/compiler/test/E2E/ResNet18/FW/10b_ptx_codegen.mlir @@ -4,7 +4,7 @@ module attributes {byre.container_module, gpu.container_module} { gpu.module @unified { - llvm.func @Unknown100(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown92(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr {llvm.noalias}, %arg6: !llvm.ptr {llvm.noalias}, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr {llvm.noalias}, %arg11: !llvm.ptr {llvm.noalias}, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -12,8 +12,8 @@ module attributes 
{byre.container_module, gpu.container_module} { %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 + %7 = llvm.mlir.constant(1.000000e-01 : f32) : f32 + %8 = llvm.mlir.constant(0.899999976 : f32) : f32 %9 = llvm.mlir.constant(512 : index) : i64 %10 = nvvm.read.ptx.sreg.ctaid.x : i32 %11 = llvm.sext %10 : i32 to i64 @@ -23,408 +23,30 @@ module attributes {byre.container_module, gpu.container_module} { %15 = llvm.sext %14 : i32 to i64 %16 = llvm.mul %13, %11 : i64 %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown99(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown98(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: 
i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown97(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = 
llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown96(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown95(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = 
nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown94(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown93(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, 
array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown92(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown91(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: 
!llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown90(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load 
%21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown89(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown88(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { + %18 = nvvm.read.ptx.sreg.nctaid.x : i32 + %19 = llvm.sext %18 : i32 to i64 + %20 = llvm.mul %13, %19 : i64 + llvm.br ^bb1(%17 : i64) + ^bb1(%21: i64): // 2 preds: ^bb0, ^bb2 + %22 = llvm.icmp "slt" %21, %9 : i64 + llvm.cond_br %22, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %23 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %24 = llvm.getelementptr %arg6[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %25 = llvm.load %24 : !llvm.ptr -> f32 + %26 = llvm.getelementptr %arg1[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %27 = llvm.load %26 : !llvm.ptr -> f32 + %28 = llvm.fmul %25, %8 : f32 + %29 = llvm.fmul %27, %7 : f32 + %30 = llvm.fadd %29, %28 : f32 + %31 = llvm.getelementptr %arg11[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %30, %31 : f32, !llvm.ptr + %32 = llvm.add %21, %20 : i64 + llvm.br ^bb1(%32 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown82(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, 
%arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr {llvm.noalias}, %arg6: !llvm.ptr {llvm.noalias}, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr {llvm.noalias}, %arg11: !llvm.ptr {llvm.noalias}, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -432,8 +54,8 @@ module attributes {byre.container_module, gpu.container_module} { %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 + %7 = llvm.mlir.constant(1.000000e-01 : f32) : f32 + %8 = llvm.mlir.constant(0.899999976 : f32) : f32 %9 = llvm.mlir.constant(256 : index) : i64 %10 = nvvm.read.ptx.sreg.ctaid.x : i32 %11 = llvm.sext %10 : i32 to i64 @@ -443,23 +65,30 @@ module attributes {byre.container_module, gpu.container_module} { %15 = llvm.sext %14 : i32 to i64 %16 = llvm.mul %13, %11 : i64 %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown87(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { + %18 = nvvm.read.ptx.sreg.nctaid.x : i32 + %19 = llvm.sext %18 : i32 to i64 + %20 = llvm.mul %13, %19 : i64 + llvm.br ^bb1(%17 : i64) + ^bb1(%21: i64): // 2 preds: ^bb0, ^bb2 + %22 = llvm.icmp "slt" %21, %9 : i64 + llvm.cond_br %22, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %23 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %24 = llvm.getelementptr %arg6[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %25 = llvm.load %24 : !llvm.ptr -> f32 + %26 = llvm.getelementptr %arg1[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %27 = llvm.load %26 : !llvm.ptr -> f32 + %28 = llvm.fmul %25, %8 : f32 + %29 = llvm.fmul %27, %7 : f32 + %30 = llvm.fadd %29, %28 : f32 + %31 = llvm.getelementptr %arg11[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %30, %31 : f32, !llvm.ptr + %32 = llvm.add %21, %20 : i64 + llvm.br ^bb1(%32 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown72(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr {llvm.noalias}, %arg6: !llvm.ptr {llvm.noalias}, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr {llvm.noalias}, %arg11: !llvm.ptr {llvm.noalias}, %arg12: i64, %arg13: i64, %arg14: i64) attributes 
{gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -467,9 +96,9 @@ module attributes {byre.container_module, gpu.container_module} { %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 + %7 = llvm.mlir.constant(1.000000e-01 : f32) : f32 + %8 = llvm.mlir.constant(0.899999976 : f32) : f32 + %9 = llvm.mlir.constant(128 : index) : i64 %10 = nvvm.read.ptx.sreg.ctaid.x : i32 %11 = llvm.sext %10 : i32 to i64 %12 = nvvm.read.ptx.sreg.ntid.x : i32 @@ -478,23 +107,30 @@ module attributes {byre.container_module, gpu.container_module} { %15 = llvm.sext %14 : i32 to i64 %16 = llvm.mul %13, %11 : i64 %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown86(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { + %18 = nvvm.read.ptx.sreg.nctaid.x : i32 + %19 = llvm.sext %18 : i32 to i64 + %20 = llvm.mul %13, %19 : i64 + llvm.br ^bb1(%17 : i64) + ^bb1(%21: i64): // 2 preds: ^bb0, ^bb2 + %22 = llvm.icmp "slt" %21, %9 : i64 + llvm.cond_br %22, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %23 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %24 = llvm.getelementptr %arg6[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %25 = llvm.load %24 : !llvm.ptr -> f32 + %26 = llvm.getelementptr %arg1[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %27 = llvm.load %26 : !llvm.ptr -> f32 + %28 = llvm.fmul %25, %8 : f32 + %29 = llvm.fmul %27, %7 : f32 + %30 = llvm.fadd %29, %28 : f32 + %31 = llvm.getelementptr %arg11[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %30, %31 : f32, !llvm.ptr + %32 = llvm.add %21, %20 : i64 + llvm.br ^bb1(%32 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown62(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr {llvm.noalias}, %arg6: !llvm.ptr {llvm.noalias}, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr {llvm.noalias}, %arg11: !llvm.ptr {llvm.noalias}, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, 
array<1 x i64>, array<1 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -502,9 +138,9 @@ module attributes {byre.container_module, gpu.container_module} { %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 + %7 = llvm.mlir.constant(1.000000e-01 : f32) : f32 + %8 = llvm.mlir.constant(0.899999976 : f32) : f32 + %9 = llvm.mlir.constant(64 : index) : i64 %10 = nvvm.read.ptx.sreg.ctaid.x : i32 %11 = llvm.sext %10 : i32 to i64 %12 = nvvm.read.ptx.sreg.ntid.x : i32 @@ -513,2246 +149,182 @@ module attributes {byre.container_module, gpu.container_module} { %15 = llvm.sext %14 : i32 to i64 %16 = llvm.mul %13, %11 : i64 %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown85(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { + %18 = nvvm.read.ptx.sreg.nctaid.x : i32 + %19 = llvm.sext %18 : i32 to i64 + %20 = llvm.mul %13, %19 : i64 + llvm.br ^bb1(%17 : i64) + ^bb1(%21: i64): // 2 preds: ^bb0, ^bb2 + %22 = llvm.icmp "slt" %21, %9 : i64 + llvm.cond_br %22, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %23 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %24 = llvm.getelementptr %arg6[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %25 = llvm.load %24 : !llvm.ptr -> f32 + %26 = llvm.getelementptr %arg1[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %27 = llvm.load %26 : !llvm.ptr -> f32 + %28 = llvm.fmul %25, %8 : f32 + %29 = llvm.fmul %27, %7 : f32 + %30 = llvm.fadd %29, %28 : f32 + %31 = llvm.getelementptr %arg11[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %30, %31 : f32, !llvm.ptr + %32 = llvm.add %21, %20 : i64 + llvm.br ^bb1(%32 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown61(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr {llvm.noalias}, %arg6: !llvm.ptr {llvm.noalias}, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: !llvm.ptr {llvm.noalias}, %arg13: !llvm.ptr {llvm.noalias}, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x 
i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 + %3 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %4 = llvm.insertvalue %arg5, %3[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %5 = llvm.insertvalue %arg6, %4[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %6 = llvm.insertvalue %arg7, %5[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %7 = llvm.insertvalue %arg8, %6[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %8 = llvm.insertvalue %arg12, %3[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %9 = llvm.insertvalue %arg13, %8[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %10 = llvm.insertvalue %arg14, %9[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %11 = llvm.insertvalue %arg15, %10[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %12 = llvm.mlir.constant(0 : index) : i64 + %13 = llvm.mlir.constant(1000 : index) : i64 + %14 = nvvm.read.ptx.sreg.ctaid.x : i32 %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown84(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 
: f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown83(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown82(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = 
llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown81(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown80(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: 
!llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown79(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = 
llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown78(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown77(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = 
nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown76(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown75(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, 
array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown74(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown73(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: 
!llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown72(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load 
%21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown71(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown70(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 
- %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown69(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown68(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x 
i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown67(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown66(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = 
llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown65(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : 
f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown64(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown63(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = 
llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown62(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown61(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = 
llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown60(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: !llvm.ptr, %arg13: !llvm.ptr, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %4 = llvm.insertvalue %arg5, %3[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %5 = llvm.insertvalue %arg6, %4[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %6 = llvm.insertvalue %arg7, %5[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %7 = llvm.insertvalue %arg8, %6[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %8 = llvm.insertvalue %arg12, %3[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %9 = llvm.insertvalue %arg13, %8[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %10 = llvm.insertvalue %arg14, %9[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %11 = llvm.insertvalue %arg15, %10[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %12 = llvm.mlir.constant(0 : index) : i64 - %13 = llvm.mlir.constant(1000 : index) : i64 - %14 = nvvm.read.ptx.sreg.ctaid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = nvvm.read.ptx.sreg.ntid.x : i32 - %17 = llvm.sext %16 : i32 to i64 - %18 = nvvm.read.ptx.sreg.tid.x : i32 - %19 = llvm.sext %18 : i32 to i64 - %20 = llvm.mul %17, %15 : i64 - %21 = llvm.add %19, %20 : i64 - %22 = llvm.icmp "slt" %21, %13 : i64 - llvm.cond_br %22, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %23 = llvm.mul %12, %13 : i64 - %24 = llvm.add %23, %21 : i64 - %25 = llvm.getelementptr %arg6[%24] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %26 = llvm.load %25 : !llvm.ptr -> f16 - %27 = llvm.getelementptr %arg1[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %28 = llvm.load %27 : !llvm.ptr -> f32 - %29 = llvm.fptrunc %28 : f32 to f16 - %30 = llvm.fadd %26, %29 : f16 - %31 = llvm.getelementptr %arg13[%24] : (!llvm.ptr, 
i64) -> !llvm.ptr, f16 - llvm.store %30, %31 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown59(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %5 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %6 = llvm.insertvalue %arg8, %5[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %7 = llvm.insertvalue %arg9, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %8 = llvm.insertvalue %arg10, %7[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %9 = llvm.mlir.constant(0 : index) : i64 - %10 = llvm.mlir.constant(512000 : index) : i64 - %11 = llvm.mlir.constant(512 : index) : i64 - %12 = llvm.mlir.constant(-1 : index) : i64 - %13 = nvvm.read.ptx.sreg.ctaid.x : i32 - %14 = llvm.sext %13 : i32 to i64 - %15 = nvvm.read.ptx.sreg.ntid.x : i32 - %16 = llvm.sext %15 : i32 to i64 - %17 = nvvm.read.ptx.sreg.tid.x : i32 - %18 = llvm.sext %17 : i32 to i64 - %19 = llvm.mul %16, %14 : i64 - %20 = llvm.add %18, %19 : i64 - %21 = llvm.icmp "slt" %20, %10 : i64 - llvm.cond_br %21, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %22 = llvm.srem %20, %11 : i64 - %23 = llvm.icmp "slt" %22, %9 : i64 - %24 = llvm.add %22, %11 : i64 - %25 = llvm.select %23, %24, %22 : i1, i64 - %26 = llvm.icmp "slt" %20, %9 : i64 - %27 = llvm.sub %12, %20 : i64 - %28 = llvm.select %26, %27, %20 : i1, i64 - %29 = llvm.sdiv %28, %11 : i64 - %30 = llvm.sub %12, %29 : i64 - %31 = llvm.select %26, %30, %29 : i1, i64 - %32 = llvm.mul %31, %11 : i64 - %33 = llvm.add %32, %25 : i64 - %34 = llvm.getelementptr %arg1[%33] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %35 = llvm.load %34 : !llvm.ptr -> f32 - %36 = llvm.fptrunc %35 : f32 to f16 - %37 = llvm.getelementptr %arg8[%33] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %36, %37 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown58(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %5 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %6 = llvm.insertvalue %arg8, %5[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, 
array<2 x i64>)> - %7 = llvm.insertvalue %arg9, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %8 = llvm.insertvalue %arg10, %7[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %9 = llvm.mlir.constant(2.040100e-02 : f16) : f16 - %10 = llvm.mlir.constant(0 : index) : i64 - %11 = llvm.mlir.constant(512 : index) : i64 - %12 = nvvm.read.ptx.sreg.ctaid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.ntid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = nvvm.read.ptx.sreg.tid.x : i32 - %17 = llvm.sext %16 : i32 to i64 - %18 = llvm.mul %15, %13 : i64 - %19 = llvm.add %17, %18 : i64 - %20 = llvm.icmp "slt" %19, %11 : i64 - llvm.cond_br %20, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %21 = llvm.mul %10, %11 : i64 - %22 = llvm.add %21, %19 : i64 - %23 = llvm.getelementptr %arg1[%22] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %24 = llvm.load %23 : !llvm.ptr -> f16 - %25 = llvm.fmul %24, %9 : f16 - %26 = llvm.getelementptr %arg8[%22] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %25, %26 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown57(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(25088 : index) : i64 - %28 = llvm.mlir.constant(7 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(49 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fadd %67, %69 : f16 - %71 = llvm.intr.maxnum(%70, %25) : (f16, f16) -> f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown55(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 
= llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(2359296 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(512 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(4608 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr 
%arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown54(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(25088 : index) : i64 - %20 = llvm.mlir.constant(7 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - 
%49 = llvm.sub %21, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(49 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown52(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(2359296 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(512 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp 
"slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(4608 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown51(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, 
%13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(25088 : index) : i64 - %28 = llvm.mlir.constant(7 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(49 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fadd %67, %69 : f16 - %71 = llvm.intr.maxnum(%70, %25) : (f16, f16) -> f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown49(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) 
attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(2359296 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(512 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(4608 : 
index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown48(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(25088 : index) : i64 - %20 = llvm.mlir.constant(7 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select 
%35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - %49 = llvm.sub %21, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(49 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown46(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(1179648 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(256 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - 
%25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(2304 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown44(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x 
i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(131072 : index) : i64 - %19 = llvm.mlir.constant(256 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = nvvm.read.ptx.sreg.ctaid.x : i32 - %22 = llvm.sext %21 : i32 to i64 - %23 = nvvm.read.ptx.sreg.ntid.x : i32 - %24 = llvm.sext %23 : i32 to i64 - %25 = nvvm.read.ptx.sreg.tid.x : i32 - %26 = llvm.sext %25 : i32 to i64 - %27 = llvm.mul %24, %22 : i64 - %28 = llvm.add %26, %27 : i64 - %29 = llvm.icmp "slt" %28, %18 : i64 - llvm.cond_br %29, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %30 = llvm.srem %28, %19 : i64 - %31 = llvm.icmp "slt" %30, %17 : i64 - %32 = llvm.add %30, %19 : i64 - %33 = llvm.select %31, %32, %30 : i1, i64 - %34 = llvm.icmp "slt" %28, %17 : i64 - %35 = llvm.sub %20, %28 : i64 - %36 = llvm.select %34, %35, %28 : i1, i64 - %37 = llvm.sdiv %36, %19 : i64 - %38 = llvm.sub %20, %37 : i64 - %39 = llvm.select %34, %38, %37 : i1, i64 - %40 = llvm.mul %39, %19 : i64 - %41 = llvm.add %40, %33 : i64 - %42 = llvm.add %41, %17 : i64 - %43 = llvm.add %42, %17 : i64 - %44 = llvm.getelementptr %arg1[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %45 = llvm.load %44 : !llvm.ptr -> f32 - %46 = llvm.fptrunc %45 : f32 to f16 - %47 = llvm.getelementptr %arg12[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %46, %47 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown43(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, 
array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(50176 : index) : i64 - %28 = llvm.mlir.constant(14 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(196 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fadd %67, %69 : f16 - %71 = llvm.intr.maxnum(%70, %25) : (f16, f16) -> f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - 
llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown41(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(589824 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(256 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, 
%21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(2304 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown40(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(50176 : index) : i64 - %20 = llvm.mlir.constant(14 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = 
llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - %49 = llvm.sub %21, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(196 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown38(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, 
ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(589824 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(256 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(2304 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown37(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(50176 : index) : i64 - %28 = llvm.mlir.constant(14 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(196 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : 
i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fadd %67, %69 : f16 - %71 = llvm.intr.maxnum(%70, %25) : (f16, f16) -> f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown35(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(589824 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(256 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : 
i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(2304 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown34(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = 
llvm.mlir.constant(0.000000e+00 : f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(50176 : index) : i64 - %20 = llvm.mlir.constant(14 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 + %16 = nvvm.read.ptx.sreg.ntid.x : i32 + %17 = llvm.sext %16 : i32 to i64 + %18 = nvvm.read.ptx.sreg.tid.x : i32 + %19 = llvm.sext %18 : i32 to i64 + %20 = llvm.mul %17, %15 : i64 + %21 = llvm.add %19, %20 : i64 + %22 = nvvm.read.ptx.sreg.nctaid.x : i32 %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - %49 = llvm.sub %21, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(196 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown32(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %24 = llvm.mul %17, %23 : i64 + llvm.br ^bb1(%21 : i64) + ^bb1(%25: i64): // 2 preds: ^bb0, ^bb2 + %26 = llvm.icmp "slt" %25, %13 : i64 + llvm.cond_br %26, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %27 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %28 = llvm.mlir.constant(1 : index) : i64 + %29 = llvm.getelementptr %arg1[%25] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %30 = llvm.load %29 : !llvm.ptr -> f32 + %31 = llvm.insertvalue %25, %5[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %32 = llvm.insertvalue %28, %31[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %33 = llvm.getelementptr %arg6[%25] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %34 = llvm.mul %12, %13 : i64 + %35 = llvm.add %34, %12 : i64 + %36 = llvm.getelementptr %33[%35] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %37 = llvm.load %36 : !llvm.ptr -> f16 + %38 = llvm.fptrunc %30 : f32 to f16 + %39 = llvm.fadd %37, %38 : f16 + %40 = llvm.insertvalue %25, %9[2] : !llvm.struct<(ptr, ptr, 
i64, array<2 x i64>, array<2 x i64>)> + %41 = llvm.insertvalue %28, %40[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %42 = llvm.getelementptr %arg13[%25] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %43 = llvm.getelementptr %42[%35] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %39, %43 : f16, !llvm.ptr + %44 = llvm.add %25, %24 : i64 + llvm.br ^bb1(%44 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown60(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr {llvm.noalias}, %arg8: !llvm.ptr {llvm.noalias}, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64) attributes {gpu.kernel, nvvm.kernel} { + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %5 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %6 = llvm.insertvalue %arg8, %5[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %7 = llvm.insertvalue %arg9, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %8 = llvm.insertvalue %arg10, %7[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %9 = llvm.mlir.constant(0 : index) : i64 + %10 = llvm.mlir.constant(512000 : index) : i64 + %11 = nvvm.read.ptx.sreg.ctaid.x : i32 + %12 = llvm.sext %11 : i32 to i64 + %13 = nvvm.read.ptx.sreg.ntid.x : i32 + %14 = llvm.sext %13 : i32 to i64 + %15 = nvvm.read.ptx.sreg.tid.x : i32 + %16 = llvm.sext %15 : i32 to i64 + %17 = llvm.mul %14, %12 : i64 + %18 = llvm.add %16, %17 : i64 + %19 = nvvm.read.ptx.sreg.nctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = llvm.mul %14, %20 : i64 + llvm.br ^bb1(%18 : i64) + ^bb1(%22: i64): // 2 preds: ^bb0, ^bb2 + %23 = llvm.icmp "slt" %22, %10 : i64 + llvm.cond_br %23, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %24 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %25 = llvm.insertvalue %22, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %26 = llvm.mlir.constant(1 : index) : i64 + %27 = llvm.insertvalue %26, %25[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %28 = llvm.mlir.constant(512 : index) : i64 + %29 = llvm.getelementptr %arg1[%22] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %30 = llvm.mul %9, %28 : i64 + %31 = llvm.add %30, %9 : i64 + %32 = llvm.getelementptr %29[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %33 = llvm.load %32 : !llvm.ptr -> f32 + %34 = llvm.fptrunc %33 : f32 to f16 + %35 = llvm.insertvalue %22, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %36 = llvm.insertvalue %26, %35[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %37 = llvm.getelementptr %arg8[%22] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %38 = llvm.getelementptr %37[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %34, %38 : f16, !llvm.ptr + %39 = llvm.add %22, %21 : i64 + llvm.br ^bb1(%39 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown59(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, 
%arg5: i64, %arg6: i64, %arg7: !llvm.ptr {llvm.noalias}, %arg8: !llvm.ptr {llvm.noalias}, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64) attributes {gpu.kernel, nvvm.kernel} { + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %5 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %6 = llvm.insertvalue %arg8, %5[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %7 = llvm.insertvalue %arg9, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %8 = llvm.insertvalue %arg10, %7[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %9 = llvm.mlir.constant(2.040100e-02 : f16) : f16 + %10 = llvm.mlir.constant(0 : index) : i64 + %11 = llvm.mlir.constant(512 : index) : i64 + %12 = nvvm.read.ptx.sreg.ctaid.x : i32 + %13 = llvm.sext %12 : i32 to i64 + %14 = nvvm.read.ptx.sreg.ntid.x : i32 + %15 = llvm.sext %14 : i32 to i64 + %16 = nvvm.read.ptx.sreg.tid.x : i32 + %17 = llvm.sext %16 : i32 to i64 + %18 = llvm.mul %15, %13 : i64 + %19 = llvm.add %17, %18 : i64 + %20 = nvvm.read.ptx.sreg.nctaid.x : i32 + %21 = llvm.sext %20 : i32 to i64 + %22 = llvm.mul %15, %21 : i64 + llvm.br ^bb1(%19 : i64) + ^bb1(%23: i64): // 2 preds: ^bb0, ^bb2 + %24 = llvm.icmp "slt" %23, %11 : i64 + llvm.cond_br %24, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %25 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %26 = llvm.insertvalue %23, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %27 = llvm.mlir.constant(1 : index) : i64 + %28 = llvm.insertvalue %27, %26[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %29 = llvm.getelementptr %arg1[%23] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %30 = llvm.mul %10, %11 : i64 + %31 = llvm.add %30, %10 : i64 + %32 = llvm.getelementptr %29[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %33 = llvm.load %32 : !llvm.ptr -> f16 + %34 = llvm.fmul %33, %9 : f16 + %35 = llvm.insertvalue %23, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %36 = llvm.insertvalue %27, %35[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %37 = llvm.getelementptr %arg8[%23] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %38 = llvm.getelementptr %37[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %34, %38 : f16, !llvm.ptr + %39 = llvm.add %23, %22 : i64 + llvm.br ^bb1(%39 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown51(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 
x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2770,70 +342,78 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(294912 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(128 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(1152 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown30(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %19 = llvm.insertvalue %arg24, 
%18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %25 = llvm.mlir.constant(25088 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 + %31 = llvm.sext %30 : i32 to i64 + %32 = nvvm.read.ptx.sreg.tid.x : i32 + %33 = llvm.sext %32 : i32 to i64 + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.mlir.constant(49 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.mlir.constant(7 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fadd %59, %68 : f16 + %70 = llvm.intr.maximum(%69, %26) : (f16, f16) -> f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown49(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2852,125 +432,59 @@ module attributes {byre.container_module, gpu.container_module} { %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(32768 : index) : i64 - %19 = llvm.mlir.constant(128 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = nvvm.read.ptx.sreg.ctaid.x : i32 + %18 = llvm.mlir.constant(2359296 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 %22 = llvm.sext %21 : i32 to i64 - %23 = nvvm.read.ptx.sreg.ntid.x : i32 + %23 = nvvm.read.ptx.sreg.tid.x : i32 %24 = llvm.sext %23 : i32 to i64 - %25 = nvvm.read.ptx.sreg.tid.x : i32 - %26 = llvm.sext %25 : i32 to i64 - %27 = llvm.mul %24, %22 : i64 - %28 = llvm.add %26, %27 : i64 - %29 = llvm.icmp "slt" %28, %18 : i64 - llvm.cond_br %29, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %30 = llvm.srem %28, %19 : i64 - %31 = llvm.icmp "slt" %30, %17 : i64 - %32 = llvm.add %30, %19 : i64 - %33 = llvm.select %31, %32, %30 : i1, i64 - %34 = llvm.icmp "slt" %28, %17 : i64 - %35 = llvm.sub %20, %28 : i64 - %36 = llvm.select %34, %35, %28 : i1, i64 - %37 = llvm.sdiv %36, %19 : i64 - %38 = llvm.sub %20, %37 : i64 - %39 = llvm.select %34, %38, %37 : i1, i64 - %40 = llvm.mul %39, %19 : i64 - %41 = llvm.add %40, %33 : i64 - %42 = llvm.add %41, %17 : i64 - %43 = llvm.add %42, %17 : i64 - %44 = llvm.getelementptr %arg1[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %45 = llvm.load %44 : !llvm.ptr -> f32 - %46 = llvm.fptrunc %45 : f32 to f16 - %47 = llvm.getelementptr %arg12[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %46, %47 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown29(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, 
%arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(100352 : index) : i64 - %28 = llvm.mlir.constant(28 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = 
llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(784 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fadd %67, %69 : f16 - %71 = llvm.intr.maxnum(%70, %25) : (f16, f16) -> f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown27(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(4608 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %51 = llvm.load %50 : !llvm.ptr -> f32 + %52 = llvm.fptrunc %51 : f32 to f16 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = 
llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown48(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2988,70 +502,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(147456 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(128 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(25088 : index) : i64 + %18 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %19 = llvm.mlir.constant(0 : index) : i64 + %20 = nvvm.read.ptx.sreg.ctaid.x : i32 + %21 = llvm.sext %20 : i32 to i64 + %22 = nvvm.read.ptx.sreg.ntid.x : i32 %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 + %24 = nvvm.read.ptx.sreg.tid.x : i32 %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 + %26 = llvm.mul %23, %21 : i64 + %27 = llvm.add %25, %26 : i64 + %28 = nvvm.read.ptx.sreg.nctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = llvm.mul %23, %29 : i64 + llvm.br ^bb1(%27 : i64) + ^bb1(%31: i64): // 2 preds: ^bb0, ^bb2 %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, 
%43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(1152 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown26(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.cond_br %32, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %33 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %34 = llvm.insertvalue %31, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %35 = llvm.mlir.constant(1 : index) : i64 + %36 = llvm.insertvalue %35, %34[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %37 = llvm.insertvalue %17, %36[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %35, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(49 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %35, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(7 : index) : i64 + %43 = llvm.getelementptr %arg1[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %19, %17 : i64 + %45 = llvm.mul %19, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %19, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %19 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.intr.maximum(%51, %18) : (f16, f16) -> f16 + %53 = llvm.insertvalue %31, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %35, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %17, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %35, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %35, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%31] : 
(!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %31, %30 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown46(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3069,59 +573,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(100352 : index) : i64 - %20 = llvm.mlir.constant(28 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - %49 = llvm.sub %21, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(784 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown24(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, 
%arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %17 = llvm.mlir.constant(0 : index) : i64 + %18 = llvm.mlir.constant(1179648 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(2304 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %51 = llvm.load %50 : !llvm.ptr -> f32 + %52 = llvm.fptrunc %51 : f32 to f16 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown44(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
%2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3139,70 +644,56 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(147456 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(128 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(1152 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown23(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + %17 = llvm.mlir.constant(131072 : index) : i64 + %18 = llvm.mlir.constant(0 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = 
nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %17 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(256 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.insertvalue %34, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %40 = llvm.insertvalue %34, %39[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %42 = llvm.mul %18, %36 : i64 + %43 = llvm.add %42, %18 : i64 + %44 = llvm.add %43, %18 : i64 + %45 = llvm.add %44, %18 : i64 + %46 = llvm.getelementptr %41[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %47 = llvm.load %46 : !llvm.ptr -> f32 + %48 = llvm.fptrunc %47 : f32 to f16 + %49 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.insertvalue %34, %49[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.insertvalue %36, %50[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %52 = llvm.insertvalue %34, %51[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %34, %52[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %56 = llvm.getelementptr %55[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %48, %56 : f16, !llvm.ptr + %57 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%57 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown37(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3228,62 +719,70 @@ module attributes {byre.container_module, gpu.container_module} { %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, 
array<4 x i64>, array<4 x i64>)> %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(100352 : index) : i64 - %28 = llvm.mlir.constant(28 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 + %25 = llvm.mlir.constant(50176 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 + %32 = nvvm.read.ptx.sreg.tid.x : i32 %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(784 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fadd %67, %69 : f16 - %71 = llvm.intr.maxnum(%70, %25) : (f16, f16) -> f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown21(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = 
llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.mlir.constant(196 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.mlir.constant(14 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fadd %59, %68 : f16 + %70 = llvm.intr.maximum(%69, %26) : (f16, f16) -> f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown35(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3302,69 +801,59 @@ module attributes 
{byre.container_module, gpu.container_module} { %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(147456 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(128 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(1152 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown20(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %18 = llvm.mlir.constant(589824 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // 
pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(2304 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %51 = llvm.load %50 : !llvm.ptr -> f32 + %52 = llvm.fptrunc %51 : f32 to f16 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown34(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3382,59 +871,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(100352 : index) : i64 - %20 = llvm.mlir.constant(28 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = 
nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(50176 : index) : i64 + %18 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %19 = llvm.mlir.constant(0 : index) : i64 + %20 = nvvm.read.ptx.sreg.ctaid.x : i32 + %21 = llvm.sext %20 : i32 to i64 + %22 = nvvm.read.ptx.sreg.ntid.x : i32 %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 + %24 = nvvm.read.ptx.sreg.tid.x : i32 %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - %49 = llvm.sub %21, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(784 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown18(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %26 = llvm.mul %23, %21 : i64 + %27 = llvm.add %25, %26 : i64 + %28 = nvvm.read.ptx.sreg.nctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = llvm.mul %23, %29 : i64 + llvm.br ^bb1(%27 : i64) + ^bb1(%31: i64): // 2 preds: ^bb0, ^bb2 + %32 = llvm.icmp "slt" %31, %17 : i64 + llvm.cond_br %32, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %33 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %34 = llvm.insertvalue %31, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %35 = llvm.mlir.constant(1 : index) : i64 + %36 = llvm.insertvalue %35, %34[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %37 = llvm.insertvalue %17, %36[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %35, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(196 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %35, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(14 : index) : i64 + %43 = llvm.getelementptr %arg1[%31] : 
(!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %19, %17 : i64 + %45 = llvm.mul %19, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %19, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %19 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.intr.maximum(%51, %18) : (f16, f16) -> f16 + %53 = llvm.insertvalue %31, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %35, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %17, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %35, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %35, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %31, %30 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown32(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3453,69 +943,59 @@ module attributes {byre.container_module, gpu.container_module} { %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(73728 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 
= llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown16(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %18 = llvm.mlir.constant(294912 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(1152 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %51 = llvm.load %50 : !llvm.ptr -> f32 + %52 = llvm.fptrunc %51 : f32 to f16 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = 
llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown30(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3533,45 +1013,56 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(8192 : index) : i64 - %19 = llvm.mlir.constant(64 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(32768 : index) : i64 + %18 = llvm.mlir.constant(0 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 %22 = llvm.sext %21 : i32 to i64 - %23 = nvvm.read.ptx.sreg.ntid.x : i32 + %23 = nvvm.read.ptx.sreg.tid.x : i32 %24 = llvm.sext %23 : i32 to i64 - %25 = nvvm.read.ptx.sreg.tid.x : i32 - %26 = llvm.sext %25 : i32 to i64 - %27 = llvm.mul %24, %22 : i64 - %28 = llvm.add %26, %27 : i64 - %29 = llvm.icmp "slt" %28, %18 : i64 - llvm.cond_br %29, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %30 = llvm.srem %28, %19 : i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 %31 = llvm.icmp "slt" %30, %17 : i64 - %32 = llvm.add %30, %19 : i64 - %33 = llvm.select %31, %32, %30 : i1, i64 - %34 = llvm.icmp "slt" %28, %17 : i64 - %35 = llvm.sub %20, %28 : i64 - %36 = llvm.select %34, %35, %28 : i1, i64 - %37 = llvm.sdiv %36, %19 : i64 - %38 = llvm.sub %20, %37 : i64 - %39 = llvm.select %34, %38, %37 : i1, i64 - %40 = llvm.mul %39, %19 : i64 - %41 = llvm.add %40, %33 : i64 - %42 = llvm.add %41, %17 : i64 - %43 = llvm.add %42, %17 : i64 - %44 = llvm.getelementptr %arg1[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %45 = llvm.load %44 : !llvm.ptr -> f32 - %46 = llvm.fptrunc %45 : f32 to f16 - %47 = llvm.getelementptr %arg12[%43] : (!llvm.ptr, 
i64) -> !llvm.ptr, f16 - llvm.store %46, %47 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown15(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(128 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.insertvalue %34, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %40 = llvm.insertvalue %34, %39[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %42 = llvm.mul %18, %36 : i64 + %43 = llvm.add %42, %18 : i64 + %44 = llvm.add %43, %18 : i64 + %45 = llvm.add %44, %18 : i64 + %46 = llvm.getelementptr %41[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %47 = llvm.load %46 : !llvm.ptr -> f32 + %48 = llvm.fptrunc %47 : f32 to f16 + %49 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.insertvalue %34, %49[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.insertvalue %36, %50[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %52 = llvm.insertvalue %34, %51[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %34, %52[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %56 = llvm.getelementptr %55[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %48, %56 : f16, !llvm.ptr + %57 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%57 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown23(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3597,62 +1088,70 @@ module attributes {byre.container_module, gpu.container_module} { %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(200704 : index) : i64 - %28 = llvm.mlir.constant(56 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 + %25 = llvm.mlir.constant(100352 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 + %32 = nvvm.read.ptx.sreg.tid.x : i32 %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(3136 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fadd %67, %69 : f16 - %71 = llvm.intr.maxnum(%70, %25) : (f16, f16) -> f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown13(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: 
^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.mlir.constant(784 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.mlir.constant(28 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fadd %59, %68 : f16 + %70 = llvm.intr.maximum(%69, %26) : (f16, f16) -> f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown21(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3671,69 +1170,59 @@ module attributes {byre.container_module, gpu.container_module} { %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(36864 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown12(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %18 = llvm.mlir.constant(147456 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = 
llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(1152 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %51 = llvm.load %50 : !llvm.ptr -> f32 + %52 = llvm.fptrunc %51 : f32 to f16 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown20(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3751,59 +1240,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0.000000e+00 : 
f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(200704 : index) : i64 - %20 = llvm.mlir.constant(56 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(100352 : index) : i64 + %18 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %19 = llvm.mlir.constant(0 : index) : i64 + %20 = nvvm.read.ptx.sreg.ctaid.x : i32 + %21 = llvm.sext %20 : i32 to i64 + %22 = nvvm.read.ptx.sreg.ntid.x : i32 %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 + %24 = nvvm.read.ptx.sreg.tid.x : i32 %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - %49 = llvm.sub %21, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(3136 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown10(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %26 = llvm.mul %23, %21 : i64 + %27 = llvm.add %25, %26 : i64 + %28 = nvvm.read.ptx.sreg.nctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = llvm.mul %23, %29 : i64 + llvm.br ^bb1(%27 : i64) + ^bb1(%31: i64): // 2 preds: ^bb0, ^bb2 + %32 = llvm.icmp "slt" %31, %17 : i64 + llvm.cond_br %32, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %33 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %34 = llvm.insertvalue %31, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %35 = llvm.mlir.constant(1 : index) : i64 + %36 = llvm.insertvalue %35, %34[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %37 = llvm.insertvalue %17, %36[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %35, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(784 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x 
i64>)> + %41 = llvm.insertvalue %35, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(28 : index) : i64 + %43 = llvm.getelementptr %arg1[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %19, %17 : i64 + %45 = llvm.mul %19, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %19, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %19 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.intr.maximum(%51, %18) : (f16, f16) -> f16 + %53 = llvm.insertvalue %31, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %35, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %17, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %35, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %35, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %31, %30 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown18(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3822,69 +1312,59 @@ module attributes {byre.container_module, gpu.container_module} { %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(36864 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 
- %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown9(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + %18 = llvm.mlir.constant(73728 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(576 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = 
llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %51 = llvm.load %50 : !llvm.ptr -> f32 + %52 = llvm.fptrunc %51 : f32 to f16 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown16(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3902,70 +1382,56 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(200704 : index) : i64 - %28 = llvm.mlir.constant(56 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp 
"slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(3136 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fadd %67, %69 : f16 - %71 = llvm.intr.maxnum(%70, %25) : (f16, f16) -> f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown7(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %17 = llvm.mlir.constant(8192 : index) : i64 + %18 = llvm.mlir.constant(0 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %17 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(64 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.insertvalue %34, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %40 = llvm.insertvalue %34, %39[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %42 = llvm.mul %18, %36 : i64 + %43 = llvm.add %42, %18 : i64 + %44 = llvm.add %43, %18 : i64 + %45 = llvm.add %44, %18 : i64 + %46 = 
llvm.getelementptr %41[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %47 = llvm.load %46 : !llvm.ptr -> f32 + %48 = llvm.fptrunc %47 : f32 to f16 + %49 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.insertvalue %34, %49[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.insertvalue %36, %50[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %52 = llvm.insertvalue %34, %51[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %34, %52[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %56 = llvm.getelementptr %55[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %48, %56 : f16, !llvm.ptr + %57 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%57 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown9(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3983,70 +1449,78 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(36864 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = 
llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown6(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %25 = llvm.mlir.constant(200704 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 + %31 = llvm.sext %30 : i32 to i64 + %32 = nvvm.read.ptx.sreg.tid.x : i32 + %33 = llvm.sext %32 : i32 to i64 + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+ %47 = llvm.mlir.constant(3136 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.mlir.constant(56 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fadd %59, %68 : f16 + %70 = llvm.intr.maximum(%69, %26) : (f16, f16) -> f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown6(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -4064,59 +1538,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, 
ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(200704 : index) : i64 - %20 = llvm.mlir.constant(56 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(200704 : index) : i64 + %18 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %19 = llvm.mlir.constant(0 : index) : i64 + %20 = nvvm.read.ptx.sreg.ctaid.x : i32 + %21 = llvm.sext %20 : i32 to i64 + %22 = nvvm.read.ptx.sreg.ntid.x : i32 %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 + %24 = nvvm.read.ptx.sreg.tid.x : i32 %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - %49 = llvm.sub %21, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(3136 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown4(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %26 = llvm.mul %23, %21 : i64 + %27 = llvm.add %25, %26 : i64 + %28 = nvvm.read.ptx.sreg.nctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = llvm.mul %23, %29 : i64 + llvm.br ^bb1(%27 : i64) + ^bb1(%31: i64): // 2 preds: ^bb0, ^bb2 + %32 = llvm.icmp "slt" %31, %17 : i64 + llvm.cond_br %32, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %33 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %34 = llvm.insertvalue %31, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %35 = llvm.mlir.constant(1 : index) : i64 + %36 = llvm.insertvalue %35, %34[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %37 = llvm.insertvalue %17, %36[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %35, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(3136 : index) : i64 + %40 = 
llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %35, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(56 : index) : i64 + %43 = llvm.getelementptr %arg1[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %19, %17 : i64 + %45 = llvm.mul %19, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %19, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %19 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.intr.maximum(%51, %18) : (f16, f16) -> f16 + %53 = llvm.insertvalue %31, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %35, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %17, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %35, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %35, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %31, %30 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown4(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -4136,68 +1611,58 @@ module attributes {byre.container_module, gpu.container_module} { %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 %18 = llvm.mlir.constant(36864 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add 
%41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown3(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(576 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %51 = llvm.load %50 : !llvm.ptr -> f32 + %52 = llvm.fptrunc %51 : f32 to f16 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x 
i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown3(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -4215,59 +1680,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(802816 : index) : i64 - %20 = llvm.mlir.constant(112 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(802816 : index) : i64 + %18 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %19 = llvm.mlir.constant(0 : index) : i64 + %20 = nvvm.read.ptx.sreg.ctaid.x : i32 + %21 = llvm.sext %20 : i32 to i64 + %22 = nvvm.read.ptx.sreg.ntid.x : i32 %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 + %24 = nvvm.read.ptx.sreg.tid.x : i32 %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - %49 = llvm.sub %21, %48 : i64 - %50 = 
llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(12544 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown1(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %26 = llvm.mul %23, %21 : i64 + %27 = llvm.add %25, %26 : i64 + %28 = nvvm.read.ptx.sreg.nctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = llvm.mul %23, %29 : i64 + llvm.br ^bb1(%27 : i64) + ^bb1(%31: i64): // 2 preds: ^bb0, ^bb2 + %32 = llvm.icmp "slt" %31, %17 : i64 + llvm.cond_br %32, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %33 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %34 = llvm.insertvalue %31, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %35 = llvm.mlir.constant(1 : index) : i64 + %36 = llvm.insertvalue %35, %34[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %37 = llvm.insertvalue %17, %36[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %35, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(12544 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %35, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(112 : index) : i64 + %43 = llvm.getelementptr %arg1[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %19, %17 : i64 + %45 = llvm.mul %19, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %19, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %19 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.intr.maximum(%51, %18) : (f16, f16) -> f16 + %53 = llvm.insertvalue %31, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %35, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %17, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %35, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %35, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %31, %30 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown1(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr 
{llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -4287,68 +1753,58 @@ module attributes {byre.container_module, gpu.container_module} { %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 %18 = llvm.mlir.constant(9408 : index) : i64 - %19 = llvm.mlir.constant(7 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(3 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(147 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(49 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %19 = 
nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(147 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(49 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(7 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %51 = llvm.load %50 : !llvm.ptr -> f32 + %52 = llvm.fptrunc %51 : f32 to f16 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown0(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -4366,55 +1822,238 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue 
%arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(150528 : index) : i64 - %19 = llvm.mlir.constant(224 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(150528 : index) : i64 + %18 = llvm.mlir.constant(0 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 %22 = llvm.sext %21 : i32 to i64 - %23 = nvvm.read.ptx.sreg.ntid.x : i32 + %23 = nvvm.read.ptx.sreg.tid.x : i32 %24 = llvm.sext %23 : i32 to i64 - %25 = nvvm.read.ptx.sreg.tid.x : i32 - %26 = llvm.sext %25 : i32 to i64 - %27 = llvm.mul %24, %22 : i64 - %28 = llvm.add %26, %27 : i64 - %29 = llvm.icmp "slt" %28, %18 : i64 - llvm.cond_br %29, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %30 = llvm.srem %28, %19 : i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 %31 = llvm.icmp "slt" %30, %17 : i64 - %32 = llvm.add %30, %19 : i64 - %33 = llvm.select %31, %32, %30 : i1, i64 - %34 = llvm.icmp "slt" %28, %17 : i64 - %35 = llvm.sub %20, %28 : i64 - %36 = llvm.select %34, %35, %28 : i1, i64 - %37 = llvm.sdiv %36, %19 : i64 - %38 = llvm.sub %20, %37 : i64 - %39 = llvm.select %34, %38, %37 : i1, i64 - %40 = llvm.srem %39, %19 : i64 - %41 = llvm.icmp "slt" %40, %17 : i64 - %42 = llvm.add %40, %19 : i64 - %43 = llvm.select %41, %42, %40 : i1, i64 - %44 = llvm.icmp "slt" %39, %17 : i64 - %45 = llvm.sub %20, %39 : i64 - %46 = llvm.select %44, %45, %39 : i1, i64 - %47 = llvm.sdiv %46, %19 : i64 - %48 = llvm.sub %20, %47 : i64 - %49 = llvm.select %44, %48, %47 : i1, i64 - %50 = llvm.mul %17, %18 : i64 - %51 = llvm.mlir.constant(50176 : index) : i64 - %52 = llvm.mul %49, %51 : i64 - %53 = llvm.add %50, %52 : i64 - %54 = llvm.mul %43, %19 : i64 - %55 = llvm.add %53, %54 : i64 - %56 = llvm.add %55, %33 : i64 - %57 = llvm.getelementptr %arg1[%56] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %58 = llvm.load %57 : !llvm.ptr -> f32 - %59 = llvm.fptrunc %58 : f32 to f16 - %60 = llvm.getelementptr %arg12[%56] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %59, %60 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.insertvalue %17, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %37 = llvm.insertvalue %34, %36[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.mlir.constant(50176 : index) : i64 + %39 = llvm.insertvalue %38, %37[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %40 = llvm.insertvalue %34, %39[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.mlir.constant(224 : index) : i64 + %42 = llvm.getelementptr %arg1[%30] : 
(!llvm.ptr, i64) -> !llvm.ptr, f32 + %43 = llvm.mul %18, %17 : i64 + %44 = llvm.mul %18, %38 : i64 + %45 = llvm.add %43, %44 : i64 + %46 = llvm.mul %18, %41 : i64 + %47 = llvm.add %45, %46 : i64 + %48 = llvm.add %47, %18 : i64 + %49 = llvm.getelementptr %42[%48] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %50 = llvm.load %49 : !llvm.ptr -> f32 + %51 = llvm.fptrunc %50 : f32 to f16 + %52 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %34, %52[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %17, %53[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %34, %54[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %38, %55[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %34, %56[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.getelementptr %58[%48] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %51, %59 : f16, !llvm.ptr + %60 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%60 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.mlir.global internal @__wg_Unknown58_kernel_0() {addr_space = 3 : i32} : !llvm.array<64 x f16> + llvm.mlir.global internal @__wg_Unknown58_kernel_1() {addr_space = 3 : i32} : !llvm.array<32 x f16> + llvm.mlir.global internal @__wg_Unknown58_kernel_2() {addr_space = 3 : i32} : !llvm.array<16 x f16> + llvm.mlir.global internal @__wg_Unknown58_kernel_3() {addr_space = 3 : i32} : !llvm.array<8 x f16> + llvm.mlir.global internal @__wg_Unknown58_kernel_4() {addr_space = 3 : i32} : !llvm.array<4 x f16> + llvm.mlir.global internal @__wg_Unknown58_kernel_5() {addr_space = 3 : i32} : !llvm.array<2 x f16> + llvm.func @Unknown58_kernel(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr {llvm.noalias}, %arg8: !llvm.ptr {llvm.noalias}, %arg9: i64, %arg10: i64, %arg11: i64) attributes {gpu.kernel, gpu.known_block_size = array, gpu.known_grid_size = array, nvvm.kernel} { + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %5 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %6 = llvm.insertvalue %arg7, %5[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %7 = llvm.insertvalue %arg8, %6[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %8 = llvm.mlir.addressof @__wg_Unknown58_kernel_0 : !llvm.ptr<3> + %9 = llvm.getelementptr %8[0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<64 x f16> + %10 = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %11 = llvm.insertvalue %9, %10[0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %12 = llvm.insertvalue %9, %11[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %13 = llvm.mlir.constant(0 : index) : i64 + %14 = 
llvm.mlir.constant(64 : index) : i64 + %15 = llvm.mlir.constant(1 : index) : i64 + %16 = llvm.mlir.addressof @__wg_Unknown58_kernel_1 : !llvm.ptr<3> + %17 = llvm.getelementptr %16[0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<32 x f16> + %18 = llvm.insertvalue %17, %10[0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %19 = llvm.insertvalue %17, %18[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %20 = llvm.mlir.constant(32 : index) : i64 + %21 = llvm.mlir.addressof @__wg_Unknown58_kernel_2 : !llvm.ptr<3> + %22 = llvm.getelementptr %21[0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<16 x f16> + %23 = llvm.insertvalue %22, %10[0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %24 = llvm.insertvalue %22, %23[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %25 = llvm.mlir.constant(16 : index) : i64 + %26 = llvm.mlir.addressof @__wg_Unknown58_kernel_3 : !llvm.ptr<3> + %27 = llvm.getelementptr %26[0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<8 x f16> + %28 = llvm.insertvalue %27, %10[0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %29 = llvm.insertvalue %27, %28[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %30 = llvm.mlir.constant(8 : index) : i64 + %31 = llvm.mlir.addressof @__wg_Unknown58_kernel_4 : !llvm.ptr<3> + %32 = llvm.getelementptr %31[0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<4 x f16> + %33 = llvm.insertvalue %32, %10[0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %34 = llvm.insertvalue %32, %33[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %35 = llvm.mlir.constant(4 : index) : i64 + %36 = llvm.mlir.addressof @__wg_Unknown58_kernel_5 : !llvm.ptr<3> + %37 = llvm.getelementptr %36[0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<2 x f16> + %38 = llvm.insertvalue %37, %10[0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %39 = llvm.insertvalue %37, %38[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %40 = llvm.mlir.constant(2 : index) : i64 + %41 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %42 = llvm.mlir.constant(49 : index) : i64 + %43 = nvvm.read.ptx.sreg.ctaid.x : i32 + %44 = llvm.sext %43 : i32 to i64 + %45 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %46 = llvm.mul %44, %42 : i64 + %47 = nvvm.read.ptx.sreg.tid.x : i32 + %48 = llvm.sext %47 : i32 to i64 + %49 = llvm.srem %48, %14 : i64 + %50 = llvm.icmp "slt" %49, %13 : i64 + %51 = llvm.add %49, %14 : i64 + %52 = llvm.select %50, %51, %49 : i1, i64 + %53 = llvm.icmp "slt" %52, %42 : i64 + %54 = llvm.select %53, %52, %42 : i1, i64 + %55 = llvm.add %52, %15 : i64 + %56 = llvm.icmp "slt" %55, %42 : i64 + %57 = llvm.select %56, %55, %42 : i1, i64 + %58 = llvm.sub %57, %54 : i64 + %59 = llvm.add %46, %54 : i64 + %60 = llvm.insertvalue %59, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %61 = llvm.insertvalue %15, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %62 = llvm.icmp "ugt" %58, %13 : i64 + llvm.cond_br %62, ^bb1, ^bb2(%41 : f16) + ^bb1: // pred: ^bb0 + %63 = llvm.getelementptr %arg1[%59] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %64 = llvm.mul %58, %13 : i64 + %65 = llvm.add %64, %13 : i64 + %66 = llvm.getelementptr %63[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.load %66 : !llvm.ptr -> f16 + llvm.br ^bb2(%67 : f16) + ^bb2(%68: f16): // 2 preds: ^bb0, ^bb1 + %69 
= llvm.fadd %68, %41 : f16 + %70 = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64)> + %71 = llvm.getelementptr %9[%48] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + llvm.store %69, %71 : f16, !llvm.ptr<3> + nvvm.barrier0 + %72 = llvm.icmp "ult" %48, %20 : i64 + llvm.cond_br %72, ^bb3, ^bb4 + ^bb3: // pred: ^bb2 + %73 = llvm.mul %48, %40 : i64 + %74 = llvm.getelementptr %9[%73] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %75 = llvm.load %74 : !llvm.ptr<3> -> f16 + %76 = llvm.fadd %75, %41 : f16 + %77 = llvm.add %73, %15 : i64 + %78 = llvm.getelementptr %9[%77] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %79 = llvm.load %78 : !llvm.ptr<3> -> f16 + %80 = llvm.fadd %79, %76 : f16 + %81 = llvm.getelementptr %17[%48] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + llvm.store %80, %81 : f16, !llvm.ptr<3> + llvm.br ^bb4 + ^bb4: // 2 preds: ^bb2, ^bb3 + nvvm.barrier0 + %82 = llvm.icmp "ult" %48, %25 : i64 + llvm.cond_br %82, ^bb5, ^bb6 + ^bb5: // pred: ^bb4 + %83 = llvm.mul %48, %40 : i64 + %84 = llvm.getelementptr %17[%83] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %85 = llvm.load %84 : !llvm.ptr<3> -> f16 + %86 = llvm.fadd %85, %41 : f16 + %87 = llvm.add %83, %15 : i64 + %88 = llvm.getelementptr %17[%87] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %89 = llvm.load %88 : !llvm.ptr<3> -> f16 + %90 = llvm.fadd %89, %86 : f16 + %91 = llvm.getelementptr %22[%48] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + llvm.store %90, %91 : f16, !llvm.ptr<3> + llvm.br ^bb6 + ^bb6: // 2 preds: ^bb4, ^bb5 + nvvm.barrier0 + %92 = llvm.icmp "ult" %48, %30 : i64 + llvm.cond_br %92, ^bb7, ^bb8 + ^bb7: // pred: ^bb6 + %93 = llvm.mul %48, %40 : i64 + %94 = llvm.getelementptr %22[%93] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %95 = llvm.load %94 : !llvm.ptr<3> -> f16 + %96 = llvm.fadd %95, %41 : f16 + %97 = llvm.add %93, %15 : i64 + %98 = llvm.getelementptr %22[%97] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %99 = llvm.load %98 : !llvm.ptr<3> -> f16 + %100 = llvm.fadd %99, %96 : f16 + %101 = llvm.getelementptr %27[%48] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + llvm.store %100, %101 : f16, !llvm.ptr<3> + llvm.br ^bb8 + ^bb8: // 2 preds: ^bb6, ^bb7 + nvvm.barrier0 + %102 = llvm.icmp "ult" %48, %35 : i64 + llvm.cond_br %102, ^bb9, ^bb10 + ^bb9: // pred: ^bb8 + %103 = llvm.mul %48, %40 : i64 + %104 = llvm.getelementptr %27[%103] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %105 = llvm.load %104 : !llvm.ptr<3> -> f16 + %106 = llvm.fadd %105, %41 : f16 + %107 = llvm.add %103, %15 : i64 + %108 = llvm.getelementptr %27[%107] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %109 = llvm.load %108 : !llvm.ptr<3> -> f16 + %110 = llvm.fadd %109, %106 : f16 + %111 = llvm.getelementptr %32[%48] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + llvm.store %110, %111 : f16, !llvm.ptr<3> + llvm.br ^bb10 + ^bb10: // 2 preds: ^bb8, ^bb9 + nvvm.barrier0 + %112 = llvm.icmp "ult" %48, %40 : i64 + llvm.cond_br %112, ^bb11, ^bb12 + ^bb11: // pred: ^bb10 + %113 = llvm.mul %48, %40 : i64 + %114 = llvm.getelementptr %32[%113] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %115 = llvm.load %114 : !llvm.ptr<3> -> f16 + %116 = llvm.fadd %115, %41 : f16 + %117 = llvm.add %113, %15 : i64 + %118 = llvm.getelementptr %32[%117] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %119 = llvm.load %118 : !llvm.ptr<3> -> f16 + %120 = llvm.fadd %119, %116 : f16 + %121 = llvm.getelementptr %37[%48] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + llvm.store %120, %121 : f16, !llvm.ptr<3> + llvm.br ^bb12 + ^bb12: // 2 preds: ^bb10, ^bb11 + nvvm.barrier0 + %122 = llvm.icmp 
"ult" %48, %15 : i64 + llvm.cond_br %122, ^bb13, ^bb14 + ^bb13: // pred: ^bb12 + %123 = llvm.mul %48, %40 : i64 + %124 = llvm.getelementptr %37[%123] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %125 = llvm.load %124 : !llvm.ptr<3> -> f16 + %126 = llvm.fadd %125, %41 : f16 + %127 = llvm.add %123, %15 : i64 + %128 = llvm.getelementptr %37[%127] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %129 = llvm.load %128 : !llvm.ptr<3> -> f16 + %130 = llvm.fadd %129, %126 : f16 + %131 = llvm.getelementptr %arg8[%44] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %130, %131 : f16, !llvm.ptr + llvm.br ^bb14 + ^bb14: // 2 preds: ^bb12, ^bb13 + nvvm.barrier0 llvm.return } } diff --git a/compiler/test/E2E/ResNet18/FW/2_linalg_tensor_opt.mlir b/compiler/test/E2E/ResNet18/FW/2_linalg_tensor_opt.mlir index 04d163e36..d4507bd46 100644 --- a/compiler/test/E2E/ResNet18/FW/2_linalg_tensor_opt.mlir +++ b/compiler/test/E2E/ResNet18/FW/2_linalg_tensor_opt.mlir @@ -37,53 +37,12 @@ module { %1 = mhlo.maximum %arg0, %0 : tensor<1x64x56x56xf16> return %1 : tensor<1x64x56x56xf16> } - func.func private @Unknown7(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - return %0 : tensor<64x64x3x3xf16> - } - func.func private @BatchNormTrainingOp8(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %1, %batch_mean, %batch_var : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } func.func private @Unknown9(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<1x64x56x56xf16> %1 = mhlo.add %arg0, %arg1 : tensor<1x64x56x56xf16> %2 = mhlo.maximum %1, %0 : tensor<1x64x56x56xf16> return %2 : tensor<1x64x56x56xf16> } - func.func private @Unknown10(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - return %0 : tensor<64x64x3x3xf16> - } - func.func private @BatchNormTrainingOp11(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %1, %batch_mean, %batch_var : tensor<1x64x56x56xf16>, 
tensor<64xf32>, tensor<64xf32> - } - func.func private @Unknown12(%arg0: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x64x56x56xf16> - %1 = mhlo.maximum %arg0, %0 : tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @Unknown13(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - return %0 : tensor<64x64x3x3xf16> - } - func.func private @BatchNormTrainingOp14(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %1, %batch_mean, %batch_var : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @Unknown15(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x64x56x56xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<1x64x56x56xf16> - %2 = mhlo.maximum %1, %0 : tensor<1x64x56x56xf16> - return %2 : tensor<1x64x56x56xf16> - } func.func private @Unknown16(%arg0: tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> return %0 : tensor<128x64x1x1xf16> @@ -98,12 +57,6 @@ module { %0 = mhlo.convert %arg0 : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> return %0 : tensor<128x64x3x3xf16> } - func.func private @BatchNormTrainingOp19(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %1, %batch_mean, %batch_var : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } func.func private @Unknown20(%arg0: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<1x128x28x28xf16> %1 = mhlo.maximum %arg0, %0 : tensor<1x128x28x28xf16> @@ -113,49 +66,12 @@ module { %0 = mhlo.convert %arg0 : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> return %0 : tensor<128x128x3x3xf16> } - func.func private @BatchNormTrainingOp22(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> 
(tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %1, %batch_mean, %batch_var : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } func.func private @Unknown23(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<1x128x28x28xf16> %1 = mhlo.add %arg0, %arg1 : tensor<1x128x28x28xf16> %2 = mhlo.maximum %1, %0 : tensor<1x128x28x28xf16> return %2 : tensor<1x128x28x28xf16> } - func.func private @Unknown24(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - return %0 : tensor<128x128x3x3xf16> - } - func.func private @BatchNormTrainingOp25(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %1, %batch_mean, %batch_var : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @Unknown26(%arg0: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x128x28x28xf16> - %1 = mhlo.maximum %arg0, %0 : tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @Unknown27(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - return %0 : tensor<128x128x3x3xf16> - } - func.func private @BatchNormTrainingOp28(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %1, %batch_mean, %batch_var : 
tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @Unknown29(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x128x28x28xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<1x128x28x28xf16> - %2 = mhlo.maximum %1, %0 : tensor<1x128x28x28xf16> - return %2 : tensor<1x128x28x28xf16> - } func.func private @Unknown30(%arg0: tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> return %0 : tensor<256x128x1x1xf16> @@ -170,12 +86,6 @@ module { %0 = mhlo.convert %arg0 : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> return %0 : tensor<256x128x3x3xf16> } - func.func private @BatchNormTrainingOp33(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } func.func private @Unknown34(%arg0: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<1x256x14x14xf16> %1 = mhlo.maximum %arg0, %0 : tensor<1x256x14x14xf16> @@ -185,49 +95,12 @@ module { %0 = mhlo.convert %arg0 : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> return %0 : tensor<256x256x3x3xf16> } - func.func private @BatchNormTrainingOp36(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } func.func private @Unknown37(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<1x256x14x14xf16> %1 = mhlo.add %arg0, %arg1 : tensor<1x256x14x14xf16> %2 = mhlo.maximum %1, %0 : tensor<1x256x14x14xf16> return %2 : tensor<1x256x14x14xf16> } - func.func private @Unknown38(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - return %0 : tensor<256x256x3x3xf16> - 
} - func.func private @BatchNormTrainingOp39(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @Unknown40(%arg0: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x256x14x14xf16> - %1 = mhlo.maximum %arg0, %0 : tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @Unknown41(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - return %0 : tensor<256x256x3x3xf16> - } - func.func private @BatchNormTrainingOp42(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @Unknown43(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x256x14x14xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<1x256x14x14xf16> - %2 = mhlo.maximum %1, %0 : tensor<1x256x14x14xf16> - return %2 : tensor<1x256x14x14xf16> - } func.func private @Unknown44(%arg0: tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> return %0 : tensor<512x256x1x1xf16> @@ -242,12 +115,6 @@ module { %0 = mhlo.convert %arg0 : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> return %0 : tensor<512x256x3x3xf16> } - func.func private @BatchNormTrainingOp47(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} 
: (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %1, %batch_mean, %batch_var : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } func.func private @Unknown48(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<1x512x7x7xf16> %1 = mhlo.maximum %arg0, %0 : tensor<1x512x7x7xf16> @@ -257,72 +124,36 @@ module { %0 = mhlo.convert %arg0 : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> return %0 : tensor<512x512x3x3xf16> } - func.func private @BatchNormTrainingOp50(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %1, %batch_mean, %batch_var : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } func.func private @Unknown51(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<1x512x7x7xf16> %1 = mhlo.add %arg0, %arg1 : tensor<1x512x7x7xf16> %2 = mhlo.maximum %1, %0 : tensor<1x512x7x7xf16> return %2 : tensor<1x512x7x7xf16> } - func.func private @Unknown52(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - return %0 : tensor<512x512x3x3xf16> - } - func.func private @BatchNormTrainingOp53(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %1, %batch_mean, %batch_var : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @Unknown54(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x512x7x7xf16> - %1 = mhlo.maximum %arg0, %0 : tensor<1x512x7x7xf16> - return %1 : tensor<1x512x7x7xf16> - } - func.func private @Unknown55(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - return %0 : tensor<512x512x3x3xf16> - } - func.func private 
@BatchNormTrainingOp56(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %1, %batch_mean, %batch_var : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @Unknown57(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x512x7x7xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<1x512x7x7xf16> - %2 = mhlo.maximum %1, %0 : tensor<1x512x7x7xf16> - return %2 : tensor<1x512x7x7xf16> + func.func private @Unknown58(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512xf16> attributes {__byteir_reduction_fusion__} { + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = mhlo.reduce(%arg0 init: %0) across dimensions = [3, 2] : (tensor<1x512x7x7xf16>, tensor) -> tensor<1x512xf16> + reducer(%arg1: tensor, %arg2: tensor) { + %2 = mhlo.add %arg1, %arg2 : tensor + mhlo.return %2 : tensor + } + return %1 : tensor<1x512xf16> } - func.func private @Unknown58(%arg0: tensor<1x512xf16>) -> tensor<1x512xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown59(%arg0: tensor<1x512xf16>) -> tensor<1x512xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<2.040100e-02> : tensor<1x512xf16> %1 = mhlo.multiply %arg0, %0 : tensor<1x512xf16> return %1 : tensor<1x512xf16> } - func.func private @Unknown59(%arg0: tensor<1000x512xf32>) -> tensor<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown60(%arg0: tensor<1000x512xf32>) -> tensor<1000x512xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<1000x512xf32>) -> tensor<1000x512xf16> return %0 : tensor<1000x512xf16> } - func.func private @Unknown60(%arg0: tensor<1000xf32>, %arg1: tensor<1x1000xf16>) -> tensor<1x1000xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown61(%arg0: tensor<1000xf32>, %arg1: tensor<1x1000xf16>) -> tensor<1x1000xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<1000xf32>) -> tensor<1000xf16> %1 = mhlo.reshape %0 : (tensor<1000xf16>) -> tensor<1x1000xf16> %2 = mhlo.add %arg1, %1 : tensor<1x1000xf16> return %2 : tensor<1x1000xf16> } - func.func private @Unknown61(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } func.func private @Unknown62(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> @@ -331,78 
+162,6 @@ module { %4 = mhlo.add %2, %3 : tensor<64xf32> return %4 : tensor<64xf32> } - func.func private @Unknown63(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } - func.func private @Unknown64(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } - func.func private @Unknown65(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } - func.func private @Unknown66(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } - func.func private @Unknown67(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } - func.func private @Unknown68(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } - func.func private @Unknown69(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } - func.func private @Unknown70(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } - func.func private @Unknown71(%arg0: tensor<128xf32>, %arg1: 
tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } func.func private @Unknown72(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> @@ -411,78 +170,6 @@ module { %4 = mhlo.add %2, %3 : tensor<128xf32> return %4 : tensor<128xf32> } - func.func private @Unknown73(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } - func.func private @Unknown74(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } - func.func private @Unknown75(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } - func.func private @Unknown76(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } - func.func private @Unknown77(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } - func.func private @Unknown78(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } - func.func private @Unknown79(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant 
dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } - func.func private @Unknown80(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } - func.func private @Unknown81(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : tensor<256xf32> - return %4 : tensor<256xf32> - } func.func private @Unknown82(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> @@ -491,78 +178,6 @@ module { %4 = mhlo.add %2, %3 : tensor<256xf32> return %4 : tensor<256xf32> } - func.func private @Unknown83(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : tensor<256xf32> - return %4 : tensor<256xf32> - } - func.func private @Unknown84(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : tensor<256xf32> - return %4 : tensor<256xf32> - } - func.func private @Unknown85(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : tensor<256xf32> - return %4 : tensor<256xf32> - } - func.func private @Unknown86(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : tensor<256xf32> - return %4 : tensor<256xf32> - } - func.func private @Unknown87(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : 
tensor<256xf32> - return %4 : tensor<256xf32> - } - func.func private @Unknown88(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : tensor<256xf32> - return %4 : tensor<256xf32> - } - func.func private @Unknown89(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : tensor<256xf32> - return %4 : tensor<256xf32> - } - func.func private @Unknown90(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : tensor<256xf32> - return %4 : tensor<256xf32> - } - func.func private @Unknown91(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } func.func private @Unknown92(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> @@ -571,206 +186,137 @@ module { %4 = mhlo.add %2, %3 : tensor<512xf32> return %4 : tensor<512xf32> } - func.func private @Unknown93(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } - func.func private @Unknown94(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } - func.func private @Unknown95(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } - func.func private @Unknown96(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes 
{__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } - func.func private @Unknown97(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } - func.func private @Unknown98(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } - func.func private @Unknown99(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } - func.func private @Unknown100(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } func.func @main(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>, %arg2: tensor<64x3x7x7xf32>, %arg3: tensor<1000xf32>, %arg4: tensor<1000x512xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64x64x3x3xf32>, %arg10: tensor<64x64x3x3xf32>, %arg11: tensor<64xf32>, %arg12: tensor<64xf32>, %arg13: tensor<64xf32>, %arg14: tensor<64xf32>, %arg15: tensor<64x64x3x3xf32>, %arg16: tensor<64x64x3x3xf32>, %arg17: tensor<128xf32>, %arg18: tensor<128xf32>, %arg19: tensor<128xf32>, %arg20: tensor<128xf32>, %arg21: tensor<128x64x3x3xf32>, %arg22: tensor<128x128x3x3xf32>, %arg23: tensor<128x64x1x1xf32>, %arg24: tensor<128xf32>, %arg25: tensor<128xf32>, %arg26: tensor<128xf32>, %arg27: tensor<128xf32>, %arg28: tensor<128xf32>, %arg29: tensor<128xf32>, %arg30: tensor<128x128x3x3xf32>, %arg31: tensor<128x128x3x3xf32>, %arg32: tensor<256xf32>, %arg33: tensor<256xf32>, %arg34: tensor<256xf32>, %arg35: tensor<256xf32>, %arg36: tensor<256x128x3x3xf32>, %arg37: tensor<256x256x3x3xf32>, %arg38: tensor<256x128x1x1xf32>, %arg39: tensor<256xf32>, %arg40: tensor<256xf32>, %arg41: tensor<256xf32>, %arg42: tensor<256xf32>, %arg43: tensor<256xf32>, %arg44: tensor<256xf32>, %arg45: tensor<256x256x3x3xf32>, %arg46: tensor<256x256x3x3xf32>, %arg47: tensor<512xf32>, %arg48: tensor<512xf32>, %arg49: tensor<512xf32>, %arg50: tensor<512xf32>, %arg51: tensor<512x256x3x3xf32>, %arg52: tensor<512x512x3x3xf32>, %arg53: tensor<512x256x1x1xf32>, %arg54: 
tensor<512xf32>, %arg55: tensor<512xf32>, %arg56: tensor<512xf32>, %arg57: tensor<512xf32>, %arg58: tensor<512xf32>, %arg59: tensor<512xf32>, %arg60: tensor<512x512x3x3xf32>, %arg61: tensor<512x512x3x3xf32>, %arg62: tensor, %arg63: tensor<64xf32>, %arg64: tensor<64xf32>, %arg65: tensor, %arg66: tensor<64xf32>, %arg67: tensor<64xf32>, %arg68: tensor, %arg69: tensor<64xf32>, %arg70: tensor<64xf32>, %arg71: tensor, %arg72: tensor<64xf32>, %arg73: tensor<64xf32>, %arg74: tensor, %arg75: tensor<64xf32>, %arg76: tensor<64xf32>, %arg77: tensor, %arg78: tensor<128xf32>, %arg79: tensor<128xf32>, %arg80: tensor, %arg81: tensor<128xf32>, %arg82: tensor<128xf32>, %arg83: tensor, %arg84: tensor<128xf32>, %arg85: tensor<128xf32>, %arg86: tensor, %arg87: tensor<128xf32>, %arg88: tensor<128xf32>, %arg89: tensor, %arg90: tensor<128xf32>, %arg91: tensor<128xf32>, %arg92: tensor, %arg93: tensor<256xf32>, %arg94: tensor<256xf32>, %arg95: tensor, %arg96: tensor<256xf32>, %arg97: tensor<256xf32>, %arg98: tensor, %arg99: tensor<256xf32>, %arg100: tensor<256xf32>, %arg101: tensor, %arg102: tensor<256xf32>, %arg103: tensor<256xf32>, %arg104: tensor, %arg105: tensor<256xf32>, %arg106: tensor<256xf32>, %arg107: tensor, %arg108: tensor<512xf32>, %arg109: tensor<512xf32>, %arg110: tensor, %arg111: tensor<512xf32>, %arg112: tensor<512xf32>, %arg113: tensor, %arg114: tensor<512xf32>, %arg115: tensor<512xf32>, %arg116: tensor, %arg117: tensor<512xf32>, %arg118: tensor<512xf32>, %arg119: tensor, %arg120: tensor<512xf32>, %arg121: tensor<512xf32>, %arg122: tensor<1x3x224x224xf32>) -> (tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, 
tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16>) { - %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = mhlo.constant dense<0xFC00> : tensor - %2 = call @Unknown0(%arg122) : (tensor<1x3x224x224xf32>) -> tensor<1x3x224x224xf16> - %3 = call @Unknown1(%arg2) : (tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> - %4 = mhlo.convolution(%2, %3) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x3x224x224xf16>, tensor<64x3x7x7xf16>) -> tensor<1x64x112x112xf16> - %5:3 = call @BatchNormTrainingOp2(%4, %arg1, %arg0) : (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) - %6 = call @Unknown3(%5#0) : (tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> - %7 = "mhlo.reduce_window"(%6, %1) ({ + %0 = mhlo.constant dense<0xFC00> : tensor + %1 = call @Unknown0(%arg122) : (tensor<1x3x224x224xf32>) -> tensor<1x3x224x224xf16> + %2 = call @Unknown1(%arg2) : (tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> + %3 = mhlo.convolution(%1, %2) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x3x224x224xf16>, tensor<64x3x7x7xf16>) -> tensor<1x64x112x112xf16> + %4:3 = call @BatchNormTrainingOp2(%3, %arg1, %arg0) : (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) + %5 = call @Unknown3(%4#0) : (tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> + %6 = "mhlo.reduce_window"(%5, %0) ({ ^bb0(%arg123: tensor, %arg124: tensor): - %127 = mhlo.maximum %arg123, %arg124 : tensor - mhlo.return %127 : tensor + %126 = mhlo.maximum %arg123, %arg124 : tensor + mhlo.return %126 : tensor }) {base_dilations = dense<1> : tensor<4xi64>, padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : (tensor<1x64x112x112xf16>, tensor) -> tensor<1x64x56x56xf16> - %8 = call @Unknown4(%arg9) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %9 = mhlo.convolution(%7, %8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : 
(tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %10:3 = call @BatchNormTrainingOp5(%9, %arg6, %arg5) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %11 = call @Unknown6(%10#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %12 = call @Unknown7(%arg10) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %13 = mhlo.convolution(%11, %12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %14:3 = call @BatchNormTrainingOp8(%13, %arg8, %arg7) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %15 = call @Unknown9(%14#0, %7) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %16 = call @Unknown10(%arg15) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %17 = mhlo.convolution(%15, %16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %18:3 = call @BatchNormTrainingOp11(%17, %arg12, %arg11) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %19 = call @Unknown12(%18#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %20 = call @Unknown13(%arg16) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %21 = mhlo.convolution(%19, %20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %22:3 = call @BatchNormTrainingOp14(%21, %arg14, %arg13) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %23 = call @Unknown15(%22#0, %15) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %24 = call @Unknown16(%arg23) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> - %25 = mhlo.convolution(%23, %24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<128x64x1x1xf16>) -> tensor<1x128x28x28xf16> - %26:3 = call @BatchNormTrainingOp17(%25, %arg25, %arg24) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %27 = call @Unknown18(%arg21) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> - %28 = mhlo.convolution(%23, %27) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>) -> tensor<1x128x28x28xf16> - 
%29:3 = call @BatchNormTrainingOp19(%28, %arg18, %arg17) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %30 = call @Unknown20(%29#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %31 = call @Unknown21(%arg22) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %32 = mhlo.convolution(%30, %31) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %33:3 = call @BatchNormTrainingOp22(%32, %arg20, %arg19) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %34 = call @Unknown23(%33#0, %26#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %35 = call @Unknown24(%arg30) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %36 = mhlo.convolution(%34, %35) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %37:3 = call @BatchNormTrainingOp25(%36, %arg27, %arg26) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %38 = call @Unknown26(%37#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %39 = call @Unknown27(%arg31) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %40 = mhlo.convolution(%38, %39) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %41:3 = call @BatchNormTrainingOp28(%40, %arg29, %arg28) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %42 = call @Unknown29(%41#0, %34) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %43 = call @Unknown30(%arg38) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> - %44 = mhlo.convolution(%42, %43) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<256x128x1x1xf16>) -> tensor<1x256x14x14xf16> - %45:3 = call @BatchNormTrainingOp31(%44, %arg40, %arg39) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %46 = call @Unknown32(%arg36) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> - %47 = mhlo.convolution(%42, %46) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>) -> tensor<1x256x14x14xf16> 
- %48:3 = call @BatchNormTrainingOp33(%47, %arg33, %arg32) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %49 = call @Unknown34(%48#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %50 = call @Unknown35(%arg37) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %51 = mhlo.convolution(%49, %50) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %52:3 = call @BatchNormTrainingOp36(%51, %arg35, %arg34) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %53 = call @Unknown37(%52#0, %45#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %54 = call @Unknown38(%arg45) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %55 = mhlo.convolution(%53, %54) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %56:3 = call @BatchNormTrainingOp39(%55, %arg42, %arg41) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %57 = call @Unknown40(%56#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %58 = call @Unknown41(%arg46) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %59 = mhlo.convolution(%57, %58) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %60:3 = call @BatchNormTrainingOp42(%59, %arg44, %arg43) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %61 = call @Unknown43(%60#0, %53) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %62 = call @Unknown44(%arg53) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> - %63 = mhlo.convolution(%61, %62) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<512x256x1x1xf16>) -> tensor<1x512x7x7xf16> - %64:3 = call @BatchNormTrainingOp45(%63, %arg55, %arg54) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %65 = call @Unknown46(%arg51) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> - %66 = mhlo.convolution(%61, %65) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>) -> tensor<1x512x7x7xf16> - 
%67:3 = call @BatchNormTrainingOp47(%66, %arg48, %arg47) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %68 = call @Unknown48(%67#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %69 = call @Unknown49(%arg52) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %70 = mhlo.convolution(%68, %69) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %71:3 = call @BatchNormTrainingOp50(%70, %arg50, %arg49) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %72 = call @Unknown51(%71#0, %64#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %73 = call @Unknown52(%arg60) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %74 = mhlo.convolution(%72, %73) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %75:3 = call @BatchNormTrainingOp53(%74, %arg57, %arg56) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %76 = call @Unknown54(%75#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %77 = call @Unknown55(%arg61) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %78 = mhlo.convolution(%76, %77) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %79:3 = call @BatchNormTrainingOp56(%78, %arg59, %arg58) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %80 = call @Unknown57(%79#0, %72) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %81 = mhlo.reduce(%80 init: %0) across dimensions = [3, 2] : (tensor<1x512x7x7xf16>, tensor) -> tensor<1x512xf16> - reducer(%arg123: tensor, %arg124: tensor) { - %127 = mhlo.add %arg123, %arg124 : tensor - mhlo.return %127 : tensor - } - %82 = call @Unknown58(%81) : (tensor<1x512xf16>) -> tensor<1x512xf16> - %83 = call @Unknown59(%arg4) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> - %84 = "mhlo.transpose"(%83) {minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1000x512xf16>) -> tensor<512x1000xf16> - %85 = "mhlo.dot_general"(%82, %83) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<1x512xf16>, tensor<1000x512xf16>) -> tensor<1x1000xf16> - %86 = call @Unknown60(%arg3, %85) : (tensor<1000xf32>, tensor<1x1000xf16>) -> tensor<1x1000xf16> - %87 = call @Unknown61(%5#1, %arg63) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %88 = call @Unknown62(%5#2, %arg64) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %89 = call @Unknown63(%10#1, %arg66) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %90 = call 
@Unknown64(%10#2, %arg67) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %91 = call @Unknown65(%14#1, %arg69) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %92 = call @Unknown66(%14#2, %arg70) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %93 = call @Unknown67(%18#1, %arg72) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %94 = call @Unknown68(%18#2, %arg73) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %95 = call @Unknown69(%22#1, %arg75) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %96 = call @Unknown70(%22#2, %arg76) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %97 = call @Unknown71(%29#1, %arg78) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %98 = call @Unknown72(%29#2, %arg79) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %99 = call @Unknown73(%33#1, %arg81) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %100 = call @Unknown74(%33#2, %arg82) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %101 = call @Unknown75(%26#1, %arg84) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %102 = call @Unknown76(%26#2, %arg85) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %103 = call @Unknown77(%37#1, %arg87) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %104 = call @Unknown78(%37#2, %arg88) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %105 = call @Unknown79(%41#1, %arg90) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %106 = call @Unknown80(%41#2, %arg91) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %107 = call @Unknown81(%48#1, %arg93) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %108 = call @Unknown82(%48#2, %arg94) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %109 = call @Unknown83(%52#1, %arg96) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %110 = call @Unknown84(%52#2, %arg97) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %111 = call @Unknown85(%45#1, %arg99) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %112 = call @Unknown86(%45#2, %arg100) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %113 = call @Unknown87(%56#1, %arg102) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %114 = call @Unknown88(%56#2, %arg103) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %115 = call @Unknown89(%60#1, %arg105) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %116 = call @Unknown90(%60#2, %arg106) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %117 = call @Unknown91(%67#1, %arg108) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %118 = call @Unknown92(%67#2, %arg109) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %119 = call @Unknown93(%71#1, %arg111) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %120 = call @Unknown94(%71#2, %arg112) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %121 = call @Unknown95(%64#1, %arg114) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %122 = call @Unknown96(%64#2, %arg115) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %123 = call @Unknown97(%75#1, %arg117) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %124 = call @Unknown98(%75#2, %arg118) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %125 = call @Unknown99(%79#1, %arg120) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %126 = call @Unknown100(%79#2, %arg121) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - return %86, %arg0, %arg1, 
%arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, %123, %124, %125, %126, %3, %2, %4, %6, %7, %8, %9, %11, %12, %13, %15, %16, %17, %19, %20, %21, %23, %27, %28, %30, %31, %32, %24, %25, %34, %35, %36, %38, %39, %40, %42, %46, %47, %49, %50, %51, %43, %44, %53, %54, %55, %57, %58, %59, %61, %65, %66, %68, %69, %70, %62, %63, %72, %73, %74, %76, %77, %78, %80, %82, %84 : tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, 
tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16> + %7 = call @Unknown4(%arg9) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %8 = mhlo.convolution(%6, %7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %9:3 = call @BatchNormTrainingOp5(%8, %arg6, %arg5) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %10 = call @Unknown6(%9#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %11 = call @Unknown4(%arg10) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %12 = mhlo.convolution(%10, %11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %13:3 = call @BatchNormTrainingOp5(%12, %arg8, %arg7) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %14 = call @Unknown9(%13#0, %6) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %15 = call @Unknown4(%arg15) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %16 = mhlo.convolution(%14, %15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %17:3 = call @BatchNormTrainingOp5(%16, %arg12, %arg11) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %18 = call @Unknown6(%17#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %19 = call @Unknown4(%arg16) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %20 = mhlo.convolution(%18, %19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %21:3 = call @BatchNormTrainingOp5(%20, %arg14, %arg13) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %22 = call @Unknown9(%21#0, %14) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %23 = call @Unknown16(%arg23) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> + %24 = mhlo.convolution(%22, %23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<128x64x1x1xf16>) -> tensor<1x128x28x28xf16> + %25:3 = call @BatchNormTrainingOp17(%24, %arg25, %arg24) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, 
tensor<128xf32>) + %26 = call @Unknown18(%arg21) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> + %27 = mhlo.convolution(%22, %26) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>) -> tensor<1x128x28x28xf16> + %28:3 = call @BatchNormTrainingOp17(%27, %arg18, %arg17) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %29 = call @Unknown20(%28#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %30 = call @Unknown21(%arg22) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %31 = mhlo.convolution(%29, %30) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %32:3 = call @BatchNormTrainingOp17(%31, %arg20, %arg19) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %33 = call @Unknown23(%32#0, %25#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %34 = call @Unknown21(%arg30) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %35 = mhlo.convolution(%33, %34) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %36:3 = call @BatchNormTrainingOp17(%35, %arg27, %arg26) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %37 = call @Unknown20(%36#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %38 = call @Unknown21(%arg31) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %39 = mhlo.convolution(%37, %38) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %40:3 = call @BatchNormTrainingOp17(%39, %arg29, %arg28) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %41 = call @Unknown23(%40#0, %33) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %42 = call @Unknown30(%arg38) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> + %43 = mhlo.convolution(%41, %42) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<256x128x1x1xf16>) -> tensor<1x256x14x14xf16> + %44:3 = call @BatchNormTrainingOp31(%43, %arg40, %arg39) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, 
tensor<256xf32>) + %45 = call @Unknown32(%arg36) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> + %46 = mhlo.convolution(%41, %45) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>) -> tensor<1x256x14x14xf16> + %47:3 = call @BatchNormTrainingOp31(%46, %arg33, %arg32) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %48 = call @Unknown34(%47#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %49 = call @Unknown35(%arg37) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %50 = mhlo.convolution(%48, %49) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %51:3 = call @BatchNormTrainingOp31(%50, %arg35, %arg34) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %52 = call @Unknown37(%51#0, %44#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %53 = call @Unknown35(%arg45) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %54 = mhlo.convolution(%52, %53) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %55:3 = call @BatchNormTrainingOp31(%54, %arg42, %arg41) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %56 = call @Unknown34(%55#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %57 = call @Unknown35(%arg46) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %58 = mhlo.convolution(%56, %57) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %59:3 = call @BatchNormTrainingOp31(%58, %arg44, %arg43) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %60 = call @Unknown37(%59#0, %52) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %61 = call @Unknown44(%arg53) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> + %62 = mhlo.convolution(%60, %61) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<512x256x1x1xf16>) -> tensor<1x512x7x7xf16> + %63:3 = call @BatchNormTrainingOp45(%62, %arg55, %arg54) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, 
tensor<512xf32>) + %64 = call @Unknown46(%arg51) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> + %65 = mhlo.convolution(%60, %64) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>) -> tensor<1x512x7x7xf16> + %66:3 = call @BatchNormTrainingOp45(%65, %arg48, %arg47) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %67 = call @Unknown48(%66#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %68 = call @Unknown49(%arg52) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %69 = mhlo.convolution(%67, %68) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %70:3 = call @BatchNormTrainingOp45(%69, %arg50, %arg49) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %71 = call @Unknown51(%70#0, %63#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %72 = call @Unknown49(%arg60) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %73 = mhlo.convolution(%71, %72) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %74:3 = call @BatchNormTrainingOp45(%73, %arg57, %arg56) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %75 = call @Unknown48(%74#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %76 = call @Unknown49(%arg61) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %77 = mhlo.convolution(%75, %76) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %78:3 = call @BatchNormTrainingOp45(%77, %arg59, %arg58) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %79 = call @Unknown51(%78#0, %71) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %80 = call @Unknown58(%79) : (tensor<1x512x7x7xf16>) -> tensor<1x512xf16> + %81 = call @Unknown59(%80) : (tensor<1x512xf16>) -> tensor<1x512xf16> + %82 = call @Unknown60(%arg4) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> + %83 = "mhlo.transpose"(%82) {minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1000x512xf16>) -> tensor<512x1000xf16> + %84 = "mhlo.dot_general"(%81, %82) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<1x512xf16>, tensor<1000x512xf16>) -> tensor<1x1000xf16> + %85 = call @Unknown61(%arg3, %84) : (tensor<1000xf32>, tensor<1x1000xf16>) -> 
tensor<1x1000xf16> + %86 = call @Unknown62(%4#1, %arg63) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %87 = call @Unknown62(%4#2, %arg64) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %88 = call @Unknown62(%9#1, %arg66) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %89 = call @Unknown62(%9#2, %arg67) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %90 = call @Unknown62(%13#1, %arg69) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %91 = call @Unknown62(%13#2, %arg70) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %92 = call @Unknown62(%17#1, %arg72) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %93 = call @Unknown62(%17#2, %arg73) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %94 = call @Unknown62(%21#1, %arg75) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %95 = call @Unknown62(%21#2, %arg76) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %96 = call @Unknown72(%28#1, %arg78) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %97 = call @Unknown72(%28#2, %arg79) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %98 = call @Unknown72(%32#1, %arg81) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %99 = call @Unknown72(%32#2, %arg82) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %100 = call @Unknown72(%25#1, %arg84) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %101 = call @Unknown72(%25#2, %arg85) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %102 = call @Unknown72(%36#1, %arg87) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %103 = call @Unknown72(%36#2, %arg88) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %104 = call @Unknown72(%40#1, %arg90) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %105 = call @Unknown72(%40#2, %arg91) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %106 = call @Unknown82(%47#1, %arg93) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %107 = call @Unknown82(%47#2, %arg94) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %108 = call @Unknown82(%51#1, %arg96) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %109 = call @Unknown82(%51#2, %arg97) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %110 = call @Unknown82(%44#1, %arg99) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %111 = call @Unknown82(%44#2, %arg100) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %112 = call @Unknown82(%55#1, %arg102) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %113 = call @Unknown82(%55#2, %arg103) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %114 = call @Unknown82(%59#1, %arg105) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %115 = call @Unknown82(%59#2, %arg106) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %116 = call @Unknown92(%66#1, %arg108) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %117 = call @Unknown92(%66#2, %arg109) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %118 = call @Unknown92(%70#1, %arg111) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %119 = call @Unknown92(%70#2, %arg112) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %120 = call @Unknown92(%63#1, %arg114) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %121 = call @Unknown92(%63#2, %arg115) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %122 = call @Unknown92(%74#1, %arg117) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %123 = call 
@Unknown92(%74#2, %arg118) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %124 = call @Unknown92(%78#1, %arg120) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %125 = call @Unknown92(%78#2, %arg121) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + return %85, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, %123, %124, %125, %2, %1, %3, %5, %6, %7, %8, %10, %11, %12, %14, %15, %16, %18, %19, %20, %22, %26, %27, %29, %30, %31, %23, %24, %33, %34, %35, %37, %38, %39, %41, %45, %46, %48, %49, %50, %42, %43, %52, %53, %54, %56, %57, %58, %60, %64, %65, %67, %68, %69, %61, %62, %71, %72, %73, %75, %76, %77, %79, %81, %83 : tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, 
tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16>
   }
 }
\ No newline at end of file
diff --git a/compiler/test/E2E/ResNet18/FW/3_byre_tensor_opt.mlir b/compiler/test/E2E/ResNet18/FW/3_byre_tensor_opt.mlir
index 7eb0e9afa..2a075b46d 100644
--- a/compiler/test/E2E/ResNet18/FW/3_byre_tensor_opt.mlir
+++ b/compiler/test/E2E/ResNet18/FW/3_byre_tensor_opt.mlir
@@ -2,26 +2,65 @@
 // CHECK-LABEL: func.func @main
-#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-#map1 = affine_map<(d0, d1) -> (d0, d1)>
-#map2 = affine_map<(d0) -> (d0)>
+#map = affine_map<() -> ()>
+#map1 = affine_map<(d0) -> (d0 mod 64, 49)>
+#map2 = affine_map<(d0) -> (d0 mod 64 + 1, 49)>
+#map3 = affine_map<(d0, d1) -> (d0 - d1)>
+#map4 = affine_map<(d0) -> (d0 * 2)>
+#map5 = affine_map<(d0) -> (d0 * 2 + 1)>
 module {
   func.func private @Unknown0(%arg0: tensor<1x3x224x224xf32>) -> tensor<1x3x224x224xf16> attributes {__byteir_elementwise_fusion__} {
+    %c224 = arith.constant 224 : index
+    %c3 = arith.constant 3 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
     %0 = tensor.empty() : tensor<1x3x224x224xf16>
-    %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x3x224x224xf32>) outs(%0 : tensor<1x3x224x224xf16>) {
-    ^bb0(%in: f32, %out: f16):
-      %2 = arith.truncf %in : f32 to f16
-      linalg.yield %2 : f16
-    } -> tensor<1x3x224x224xf16>
+    %1 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %0) -> (tensor<1x3x224x224xf16>) {
+      %2 = scf.for %arg3 = %c0 to %c224 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x3x224x224xf16>) {
+        %3 = scf.for %arg5 = %c0 to %c224 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x3x224x224xf16>) {
+          %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x3x224x224xf32> to tensor<f32>
+          %4 = tensor.empty() : tensor<f16>
+          %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor<f32>) outs(%4 : tensor<f16>) {
+          ^bb0(%in: f32, %out: f16):
+            %6 = arith.truncf %in : f32 to f16
+            linalg.yield %6 : f16
+          } -> tensor<f16>
+          %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<f16> into tensor<1x3x224x224xf16>
+          scf.yield %inserted_slice : tensor<1x3x224x224xf16>
+        }
+        scf.yield %3 : tensor<1x3x224x224xf16>
+      }
+      scf.yield %2 : tensor<1x3x224x224xf16>
+    }
     return %1 : tensor<1x3x224x224xf16>
   }
   func.func private @Unknown1(%arg0: tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} {
+    %c7 = arith.constant 7 : index
+    %c3 = arith.constant 3 : index
+    %c1 = arith.constant 1 : index
+    %c64 = arith.constant 64 : index
+    %c0 = arith.constant 0 : index
     %0 = tensor.empty() : tensor<64x3x7x7xf16>
-    %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x3x7x7xf32>) outs(%0 : tensor<64x3x7x7xf16>) {
-    ^bb0(%in: f32, %out: f16):
-      %2 = arith.truncf %in : f32 to f16
-      linalg.yield %2 : f16
-    } -> tensor<64x3x7x7xf16>
+    %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x3x7x7xf16>) {
+      %2
= scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x3x7x7xf16>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x3x7x7xf16>) { + %4 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x3x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x3x7x7xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x3x7x7xf16> + scf.yield %inserted_slice : tensor<64x3x7x7xf16> + } + scf.yield %4 : tensor<64x3x7x7xf16> + } + scf.yield %3 : tensor<64x3x7x7xf16> + } + scf.yield %2 : tensor<64x3x7x7xf16> + } return %1 : tensor<64x3x7x7xf16> } func.func private @BatchNormTrainingOp2(%arg0: tensor<1x64x112x112xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { @@ -31,22 +70,57 @@ module { return %1, %batch_mean, %batch_var : tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32> } func.func private @Unknown3(%arg0: tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x112x112xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x64x112x112xf16>) outs(%0 : tensor<1x64x112x112xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x64x112x112xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<1x64x112x112xf16>) { + %2 = scf.for %arg3 = %c0 to %c112 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x64x112x112xf16>) { + %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x64x112x112xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x112x112xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x112x112xf16> + scf.yield %inserted_slice : tensor<1x64x112x112xf16> + } + scf.yield %3 : tensor<1x64x112x112xf16> + } + scf.yield %2 : tensor<1x64x112x112xf16> + } return %1 : tensor<1x64x112x112xf16> } func.func private @Unknown4(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types 
= ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x64x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x64x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x64x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x64x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x64x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x64x3x3xf16> + scf.yield %inserted_slice : tensor<64x64x3x3xf16> + } + scf.yield %4 : tensor<64x64x3x3xf16> + } + scf.yield %3 : tensor<64x64x3x3xf16> + } + scf.yield %2 : tensor<64x64x3x3xf16> + } return %1 : tensor<64x64x3x3xf16> } func.func private @BatchNormTrainingOp5(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { @@ -56,99 +130,79 @@ module { return %1, %batch_mean, %batch_var : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> } func.func private @Unknown6(%arg0: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg3 = %c0 to %c56 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg5 = %c0 to %c56 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield %inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } - func.func private @Unknown7(%arg0: tensor<64x64x3x3xf32>) 
-> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @BatchNormTrainingOp8(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %1, %batch_mean, %batch_var : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } func.func private @Unknown9(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @Unknown10(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @BatchNormTrainingOp11(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %1, %batch_mean, %batch_var : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @Unknown12(%arg0: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> 
attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @Unknown13(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @BatchNormTrainingOp14(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %1, %batch_mean, %batch_var : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @Unknown15(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c56 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + %7 = arith.maximumf %6, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield 
%inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } func.func private @Unknown16(%arg0: tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x1x1xf32>) outs(%0 : tensor<128x64x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x64x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x1x1xf16> + scf.yield %inserted_slice : tensor<128x64x1x1xf16> + } + scf.yield %2 : tensor<128x64x1x1xf16> + } return %1 : tensor<128x64x1x1xf16> } func.func private @BatchNormTrainingOp17(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { @@ -158,114 +212,136 @@ module { return %1, %batch_mean, %batch_var : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> } func.func private @Unknown18(%arg0: tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x3x3xf32>) outs(%0 : tensor<128x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x64x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x64x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x64x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + 
%inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x3x3xf16> + scf.yield %inserted_slice : tensor<128x64x3x3xf16> + } + scf.yield %4 : tensor<128x64x3x3xf16> + } + scf.yield %3 : tensor<128x64x3x3xf16> + } + scf.yield %2 : tensor<128x64x3x3xf16> + } return %1 : tensor<128x64x3x3xf16> } - func.func private @BatchNormTrainingOp19(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %1, %batch_mean, %batch_var : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } func.func private @Unknown20(%arg0: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x128x28x28xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<1x128x28x28xf16>) { + %2 = scf.for %arg3 = %c0 to %c28 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x128x28x28xf16>) { + %3 = scf.for %arg5 = %c0 to %c28 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x128x28x28xf16> + scf.yield %inserted_slice : tensor<1x128x28x28xf16> + } + scf.yield %3 : tensor<1x128x28x28xf16> + } + scf.yield %2 : tensor<1x128x28x28xf16> + } return %1 : tensor<1x128x28x28xf16> } func.func private @Unknown21(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> + %1 = scf.for 
%arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x128x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x128x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x128x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x128x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x128x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x128x3x3xf16> + scf.yield %inserted_slice : tensor<128x128x3x3xf16> + } + scf.yield %4 : tensor<128x128x3x3xf16> + } + scf.yield %3 : tensor<128x128x3x3xf16> + } + scf.yield %2 : tensor<128x128x3x3xf16> + } return %1 : tensor<128x128x3x3xf16> } - func.func private @BatchNormTrainingOp22(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %1, %batch_mean, %batch_var : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } func.func private @Unknown23(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @Unknown24(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @BatchNormTrainingOp25(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, 
tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %1, %batch_mean, %batch_var : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @Unknown26(%arg0: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @Unknown27(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @BatchNormTrainingOp28(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %1, %batch_mean, %batch_var : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @Unknown29(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x128x28x28xf16> + %1 = scf.for %arg2 = %c0 to %c128 step %c1 iter_args(%arg3 = %0) -> (tensor<1x128x28x28xf16>) { + %2 = scf.for %arg4 = %c0 to %c28 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x128x28x28xf16>) { + %3 = scf.for %arg6 = %c0 to %c28 step %c1 
iter_args(%arg7 = %arg5) -> (tensor<1x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + %7 = arith.maximumf %6, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x128x28x28xf16> + scf.yield %inserted_slice : tensor<1x128x28x28xf16> + } + scf.yield %3 : tensor<1x128x28x28xf16> + } + scf.yield %2 : tensor<1x128x28x28xf16> + } return %1 : tensor<1x128x28x28xf16> } func.func private @Unknown30(%arg0: tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x1x1xf32>) outs(%0 : tensor<256x128x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x128x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x1x1xf16> + scf.yield %inserted_slice : tensor<256x128x1x1xf16> + } + scf.yield %2 : tensor<256x128x1x1xf16> + } return %1 : tensor<256x128x1x1xf16> } func.func private @BatchNormTrainingOp31(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { @@ -275,114 +351,136 @@ module { return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> } func.func private @Unknown32(%arg0: tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x3x3xf32>) outs(%0 : tensor<256x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - 
linalg.yield %2 : f16 - } -> tensor<256x128x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x128x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x128x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x3x3xf16> + scf.yield %inserted_slice : tensor<256x128x3x3xf16> + } + scf.yield %4 : tensor<256x128x3x3xf16> + } + scf.yield %3 : tensor<256x128x3x3xf16> + } + scf.yield %2 : tensor<256x128x3x3xf16> + } return %1 : tensor<256x128x3x3xf16> } - func.func private @BatchNormTrainingOp33(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } func.func private @Unknown34(%arg0: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x256x14x14xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<1x256x14x14xf16>) { + %2 = scf.for %arg3 = %c0 to %c14 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x256x14x14xf16>) { + %3 = scf.for %arg5 = %c0 to %c14 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into 
tensor<1x256x14x14xf16> + scf.yield %inserted_slice : tensor<1x256x14x14xf16> + } + scf.yield %3 : tensor<1x256x14x14xf16> + } + scf.yield %2 : tensor<1x256x14x14xf16> + } return %1 : tensor<1x256x14x14xf16> } func.func private @Unknown35(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x256x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x256x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x256x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x256x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x256x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x256x3x3xf16> + scf.yield %inserted_slice : tensor<256x256x3x3xf16> + } + scf.yield %4 : tensor<256x256x3x3xf16> + } + scf.yield %3 : tensor<256x256x3x3xf16> + } + scf.yield %2 : tensor<256x256x3x3xf16> + } return %1 : tensor<256x256x3x3xf16> } - func.func private @BatchNormTrainingOp36(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } func.func private @Unknown37(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 
- %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @Unknown38(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @BatchNormTrainingOp39(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @Unknown40(%arg0: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @Unknown41(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @BatchNormTrainingOp42(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @Unknown43(%arg0: tensor<1x256x14x14xf16>, %arg1: 
tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x256x14x14xf16> + %1 = scf.for %arg2 = %c0 to %c256 step %c1 iter_args(%arg3 = %0) -> (tensor<1x256x14x14xf16>) { + %2 = scf.for %arg4 = %c0 to %c14 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x256x14x14xf16>) { + %3 = scf.for %arg6 = %c0 to %c14 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + %7 = arith.maximumf %6, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x256x14x14xf16> + scf.yield %inserted_slice : tensor<1x256x14x14xf16> + } + scf.yield %3 : tensor<1x256x14x14xf16> + } + scf.yield %2 : tensor<1x256x14x14xf16> + } return %1 : tensor<1x256x14x14xf16> } func.func private @Unknown44(%arg0: tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x1x1xf32>) outs(%0 : tensor<512x256x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x256x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x1x1xf16> + scf.yield %inserted_slice : tensor<512x256x1x1xf16> + } + scf.yield %2 : tensor<512x256x1x1xf16> + } return %1 : tensor<512x256x1x1xf16> } func.func private @BatchNormTrainingOp45(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : 
f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { @@ -392,793 +490,517 @@ module { return %1, %batch_mean, %batch_var : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> } func.func private @Unknown46(%arg0: tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x3x3xf32>) outs(%0 : tensor<512x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x256x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x256x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x256x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x3x3xf16> + scf.yield %inserted_slice : tensor<512x256x3x3xf16> + } + scf.yield %4 : tensor<512x256x3x3xf16> + } + scf.yield %3 : tensor<512x256x3x3xf16> + } + scf.yield %2 : tensor<512x256x3x3xf16> + } return %1 : tensor<512x256x3x3xf16> } - func.func private @BatchNormTrainingOp47(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %1, %batch_mean, %batch_var : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } func.func private @Unknown48(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for 
%arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg3 = %c0 to %c7 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } func.func private @Unknown49(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x512x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x512x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x512x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x512x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x512x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x512x3x3xf16> + scf.yield %inserted_slice : tensor<512x512x3x3xf16> + } + scf.yield %4 : tensor<512x512x3x3xf16> + } + scf.yield %3 : tensor<512x512x3x3xf16> + } + scf.yield %2 : tensor<512x512x3x3xf16> + } return %1 : tensor<512x512x3x3xf16> } - func.func private @BatchNormTrainingOp50(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %1, %batch_mean, %batch_var : 
tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } func.func private @Unknown51(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x512x7x7xf16> - return %1 : tensor<1x512x7x7xf16> - } - func.func private @Unknown52(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @BatchNormTrainingOp53(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %1, %batch_mean, %batch_var : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @Unknown54(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c512 step %c1 iter_args(%arg3 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c7 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + 
%6 = arith.addf %in, %in_1 : f16 + %7 = arith.maximumf %6, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } - func.func private @Unknown55(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @BatchNormTrainingOp56(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %1, %batch_mean, %batch_var : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @Unknown57(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown58(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x512x7x7xf16> - return %1 : tensor<1x512x7x7xf16> - } - func.func private @Unknown58(%arg0: tensor<1x512xf16>) -> tensor<1x512xf16> attributes {__byteir_elementwise_fusion__} { + %collapsed = tensor.collapse_shape %arg0 [[0, 1], [2, 3]] : tensor<1x512x7x7xf16> into tensor<512x49xf16> + %0 = tensor.empty() : tensor<512xf16> + %1 = scf.forall (%arg1) in (512) shared_outs(%arg2 = %0) -> (tensor<512xf16>) { + %extracted_slice = tensor.extract_slice %collapsed[%arg1, 0] [1, 49] [1, 1] : tensor<512x49xf16> to tensor<49xf16> + %expanded_0 = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<49xf16> into tensor<1x49xf16> + %extracted_slice_1 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<512xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %3 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %2) -> (tensor<64xf16>) { + %15 = affine.min #map1(%arg3) + %16 = affine.min 
#map2(%arg3) + %17 = affine.apply #map3(%16, %15) + %extracted_slice_7 = tensor.extract_slice %expanded_0[0, %15] [1, %17] [1, 1] : tensor<1x49xf16> to tensor + %expanded_8 = tensor.expand_shape %extracted_slice_7 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %expanded_8, %c1 : tensor<1x?xf16> + %18 = arith.cmpi ugt, %dim, %c0 : index + %19 = scf.if %18 -> (f16) { + %extracted = tensor.extract %expanded_8[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %20 = arith.addf %19, %cst : f16 + %extracted_slice_9 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %20 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %3 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %5 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %4) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<32x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_2[%arg3, %c1] : tensor<32x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %5 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %7 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %6) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<16x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_3[%arg3, %c1] : tensor<16x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %7 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %9 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %8) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<8x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_4[%arg3, %c1] : tensor<8x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %9 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %11 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %10) -> (tensor<4xf16>) { + %extracted = 
tensor.extract %expanded_5[%arg3, %c0] : tensor<4x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_5[%arg3, %c1] : tensor<4x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %11 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %13 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %12) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<2x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_6[%arg3, %c1] : tensor<2x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %14 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_1) -> (tensor) { + %15 = affine.apply #map4(%arg3) + %extracted = tensor.extract %13[%15] : tensor<2xf16> + %16 = arith.addf %extracted, %cst : f16 + %17 = affine.apply #map5(%arg3) + %extracted_7 = tensor.extract %13[%17] : tensor<2xf16> + %18 = arith.addf %extracted_7, %16 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[] [] [] : tensor to tensor + %inserted = tensor.insert %18 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %14 into %arg2[%arg1] [1] [1] : tensor into tensor<512xf16> + } + } {mapping = [#gpu.block]} + %expanded = tensor.expand_shape %1 [[0, 1]] : tensor<512xf16> into tensor<1x512xf16> + return %expanded : tensor<1x512xf16> + } + func.func private @Unknown59(%arg0: tensor<1x512xf16>) -> tensor<1x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 2.040100e-02 : f16 %0 = tensor.empty() : tensor<1x512xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x512xf16>) outs(%0 : tensor<1x512xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.mulf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x512xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<1x512xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1] [1, 1] [1, 1] : tensor<1x512xf16> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f16, %out: f16): + %4 = arith.mulf %in, %cst : f16 + linalg.yield %4 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[0, %arg1] [1, 1] [1, 1] : tensor into tensor<1x512xf16> + scf.yield %inserted_slice : tensor<1x512xf16> + } return %1 : tensor<1x512xf16> } - func.func private @Unknown59(%arg0: tensor<1000x512xf32>) -> 
tensor<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown60(%arg0: tensor<1000x512xf32>) -> tensor<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1000x512xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1000x512xf32>) outs(%0 : tensor<1000x512xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<1000x512xf16> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000x512xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1000x512xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<1000x512xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<1000x512xf16> + scf.yield %inserted_slice : tensor<1000x512xf16> + } + scf.yield %2 : tensor<1000x512xf16> + } return %1 : tensor<1000x512xf16> } - func.func private @Unknown60(%arg0: tensor<1000xf32>, %arg1: tensor<1x1000xf16>) -> tensor<1x1000xf16> attributes {__byteir_elementwise_fusion__} { - %expanded = tensor.expand_shape %arg0 [[0, 1]] : tensor<1000xf32> into tensor<1x1000xf32> + func.func private @Unknown61(%arg0: tensor<1000xf32>, %arg1: tensor<1x1000xf16>) -> tensor<1x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1x1000xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %expanded : tensor<1x1000xf16>, tensor<1x1000xf32>) outs(%0 : tensor<1x1000xf16>) { - ^bb0(%in: f16, %in_0: f32, %out: f16): - %2 = arith.truncf %in_0 : f32 to f16 - %3 = arith.addf %in, %2 : f16 - linalg.yield %3 : f16 - } -> tensor<1x1000xf16> + %1 = scf.for %arg2 = %c0 to %c1000 step %c1 iter_args(%arg3 = %0) -> (tensor<1x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<1000xf32> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2] [1, 1] [1, 1] : tensor<1x1000xf16> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_1: f16, %out: f16): + %4 = arith.truncf %in : f32 to f16 + %5 = arith.addf %in_1, %4 : f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[0, %arg2] [1, 1] [1, 1] : tensor into tensor<1x1000xf16> + scf.yield %inserted_slice : tensor<1x1000xf16> + } return %1 : tensor<1x1000xf16> } - func.func private @Unknown61(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = 
["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } func.func private @Unknown62(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown63(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown64(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown65(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown66(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf 
%in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown67(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown68(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown69(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown70(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<64xf32>) { + %extracted_slice = tensor.extract_slice %arg1[%arg2] [1] [1] : tensor<64xf32> to tensor + %extracted_slice_1 = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<64xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %4 = arith.mulf %in, %cst : f32 + %5 = arith.mulf %in_2, %cst_0 : f32 + %6 = arith.addf %5, %4 : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[%arg2] [1] [1] : tensor into tensor<64xf32> + scf.yield %inserted_slice : 
tensor<64xf32> + } return %1 : tensor<64xf32> } - func.func private @Unknown71(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } func.func private @Unknown72(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown73(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown74(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown75(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown76(%arg0: tensor<128xf32>, %arg1: 
tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown77(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown78(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown79(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown80(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> + %1 = scf.for %arg2 = %c0 to %c128 step %c1 iter_args(%arg3 = %0) -> (tensor<128xf32>) { + %extracted_slice = tensor.extract_slice %arg1[%arg2] [1] [1] : tensor<128xf32> to tensor + %extracted_slice_1 = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<128xf32> to tensor + %2 = tensor.empty() : 
tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %4 = arith.mulf %in, %cst : f32 + %5 = arith.mulf %in_2, %cst_0 : f32 + %6 = arith.addf %5, %4 : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[%arg2] [1] [1] : tensor into tensor<128xf32> + scf.yield %inserted_slice : tensor<128xf32> + } return %1 : tensor<128xf32> } - func.func private @Unknown81(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } func.func private @Unknown82(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown83(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown84(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown85(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() 
: tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown86(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown87(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown88(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown89(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown90(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: 
f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> + %1 = scf.for %arg2 = %c0 to %c256 step %c1 iter_args(%arg3 = %0) -> (tensor<256xf32>) { + %extracted_slice = tensor.extract_slice %arg1[%arg2] [1] [1] : tensor<256xf32> to tensor + %extracted_slice_1 = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<256xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %4 = arith.mulf %in, %cst : f32 + %5 = arith.mulf %in_2, %cst_0 : f32 + %6 = arith.addf %5, %4 : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[%arg2] [1] [1] : tensor into tensor<256xf32> + scf.yield %inserted_slice : tensor<256xf32> + } return %1 : tensor<256xf32> } - func.func private @Unknown91(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } func.func private @Unknown92(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown93(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown94(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { 
- ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown95(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown96(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown97(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown98(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown99(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : 
tensor<512xf32> - } - func.func private @Unknown100(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> + %1 = scf.for %arg2 = %c0 to %c512 step %c1 iter_args(%arg3 = %0) -> (tensor<512xf32>) { + %extracted_slice = tensor.extract_slice %arg1[%arg2] [1] [1] : tensor<512xf32> to tensor + %extracted_slice_1 = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<512xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %4 = arith.mulf %in, %cst : f32 + %5 = arith.mulf %in_2, %cst_0 : f32 + %6 = arith.addf %5, %4 : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[%arg2] [1] [1] : tensor into tensor<512xf32> + scf.yield %inserted_slice : tensor<512xf32> + } return %1 : tensor<512xf32> } func.func @main(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>, %arg2: tensor<64x3x7x7xf32>, %arg3: tensor<1000xf32>, %arg4: tensor<1000x512xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64x64x3x3xf32>, %arg10: tensor<64x64x3x3xf32>, %arg11: tensor<64xf32>, %arg12: tensor<64xf32>, %arg13: tensor<64xf32>, %arg14: tensor<64xf32>, %arg15: tensor<64x64x3x3xf32>, %arg16: tensor<64x64x3x3xf32>, %arg17: tensor<128xf32>, %arg18: tensor<128xf32>, %arg19: tensor<128xf32>, %arg20: tensor<128xf32>, %arg21: tensor<128x64x3x3xf32>, %arg22: tensor<128x128x3x3xf32>, %arg23: tensor<128x64x1x1xf32>, %arg24: tensor<128xf32>, %arg25: tensor<128xf32>, %arg26: tensor<128xf32>, %arg27: tensor<128xf32>, %arg28: tensor<128xf32>, %arg29: tensor<128xf32>, %arg30: tensor<128x128x3x3xf32>, %arg31: tensor<128x128x3x3xf32>, %arg32: tensor<256xf32>, %arg33: tensor<256xf32>, %arg34: tensor<256xf32>, %arg35: tensor<256xf32>, %arg36: tensor<256x128x3x3xf32>, %arg37: tensor<256x256x3x3xf32>, %arg38: tensor<256x128x1x1xf32>, %arg39: tensor<256xf32>, %arg40: tensor<256xf32>, %arg41: tensor<256xf32>, %arg42: tensor<256xf32>, %arg43: tensor<256xf32>, %arg44: tensor<256xf32>, %arg45: tensor<256x256x3x3xf32>, %arg46: tensor<256x256x3x3xf32>, %arg47: tensor<512xf32>, %arg48: tensor<512xf32>, %arg49: tensor<512xf32>, %arg50: tensor<512xf32>, %arg51: tensor<512x256x3x3xf32>, %arg52: tensor<512x512x3x3xf32>, %arg53: tensor<512x256x1x1xf32>, %arg54: tensor<512xf32>, %arg55: tensor<512xf32>, %arg56: tensor<512xf32>, %arg57: tensor<512xf32>, %arg58: tensor<512xf32>, %arg59: tensor<512xf32>, %arg60: tensor<512x512x3x3xf32>, %arg61: tensor<512x512x3x3xf32>, %arg62: tensor, %arg63: tensor<64xf32>, %arg64: tensor<64xf32>, %arg65: tensor, %arg66: tensor<64xf32>, %arg67: tensor<64xf32>, %arg68: tensor, %arg69: tensor<64xf32>, %arg70: tensor<64xf32>, %arg71: tensor, %arg72: tensor<64xf32>, %arg73: tensor<64xf32>, %arg74: tensor, %arg75: tensor<64xf32>, %arg76: tensor<64xf32>, %arg77: tensor, %arg78: tensor<128xf32>, %arg79: 
tensor<128xf32>, %arg80: tensor, %arg81: tensor<128xf32>, %arg82: tensor<128xf32>, %arg83: tensor, %arg84: tensor<128xf32>, %arg85: tensor<128xf32>, %arg86: tensor, %arg87: tensor<128xf32>, %arg88: tensor<128xf32>, %arg89: tensor, %arg90: tensor<128xf32>, %arg91: tensor<128xf32>, %arg92: tensor, %arg93: tensor<256xf32>, %arg94: tensor<256xf32>, %arg95: tensor, %arg96: tensor<256xf32>, %arg97: tensor<256xf32>, %arg98: tensor, %arg99: tensor<256xf32>, %arg100: tensor<256xf32>, %arg101: tensor, %arg102: tensor<256xf32>, %arg103: tensor<256xf32>, %arg104: tensor, %arg105: tensor<256xf32>, %arg106: tensor<256xf32>, %arg107: tensor, %arg108: tensor<512xf32>, %arg109: tensor<512xf32>, %arg110: tensor, %arg111: tensor<512xf32>, %arg112: tensor<512xf32>, %arg113: tensor, %arg114: tensor<512xf32>, %arg115: tensor<512xf32>, %arg116: tensor, %arg117: tensor<512xf32>, %arg118: tensor<512xf32>, %arg119: tensor, %arg120: tensor<512xf32>, %arg121: tensor<512xf32>, %arg122: tensor<1x3x224x224xf32>) -> (tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, 
tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16>) { - %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = mhlo.constant dense<0xFC00> : tensor - %2 = call @Unknown0(%arg122) : (tensor<1x3x224x224xf32>) -> tensor<1x3x224x224xf16> - %3 = call @Unknown1(%arg2) : (tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> - %4 = mhlo.convolution(%2, %3) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x3x224x224xf16>, tensor<64x3x7x7xf16>) -> tensor<1x64x112x112xf16> - %5:3 = call @BatchNormTrainingOp2(%4, %arg1, %arg0) : (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) - %6 = call @Unknown3(%5#0) : (tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> - %7 = "mhlo.reduce_window"(%6, %1) ({ + %0 = mhlo.constant dense<0xFC00> : tensor + %1 = call @Unknown0(%arg122) : (tensor<1x3x224x224xf32>) -> tensor<1x3x224x224xf16> + %2 = call @Unknown1(%arg2) : (tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> + %3 = mhlo.convolution(%1, %2) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x3x224x224xf16>, tensor<64x3x7x7xf16>) -> tensor<1x64x112x112xf16> + %4:3 = call @BatchNormTrainingOp2(%3, %arg1, %arg0) : (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) + %5 = call @Unknown3(%4#0) : (tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> + %6 = "mhlo.reduce_window"(%5, %0) ({ ^bb0(%arg123: tensor, %arg124: tensor): - %127 = mhlo.maximum %arg123, %arg124 : tensor - mhlo.return %127 : tensor + %126 = mhlo.maximum %arg123, %arg124 : tensor + mhlo.return %126 : tensor }) {base_dilations = dense<1> : tensor<4xi64>, padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : (tensor<1x64x112x112xf16>, tensor) -> tensor<1x64x56x56xf16> - %8 = call @Unknown4(%arg9) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %9 = mhlo.convolution(%7, %8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %10:3 = call @BatchNormTrainingOp5(%9, %arg6, %arg5) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %11 = call @Unknown6(%10#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %12 = call @Unknown7(%arg10) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %13 = mhlo.convolution(%11, %12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = 
[1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %14:3 = call @BatchNormTrainingOp8(%13, %arg8, %arg7) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %15 = call @Unknown9(%14#0, %7) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %16 = call @Unknown10(%arg15) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %17 = mhlo.convolution(%15, %16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %18:3 = call @BatchNormTrainingOp11(%17, %arg12, %arg11) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %19 = call @Unknown12(%18#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %20 = call @Unknown13(%arg16) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %21 = mhlo.convolution(%19, %20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %22:3 = call @BatchNormTrainingOp14(%21, %arg14, %arg13) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %23 = call @Unknown15(%22#0, %15) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %24 = call @Unknown16(%arg23) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> - %25 = mhlo.convolution(%23, %24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<128x64x1x1xf16>) -> tensor<1x128x28x28xf16> - %26:3 = call @BatchNormTrainingOp17(%25, %arg25, %arg24) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %27 = call @Unknown18(%arg21) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> - %28 = mhlo.convolution(%23, %27) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>) -> tensor<1x128x28x28xf16> - %29:3 = call @BatchNormTrainingOp19(%28, %arg18, %arg17) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %30 = call @Unknown20(%29#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %31 = call @Unknown21(%arg22) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %32 = mhlo.convolution(%30, %31) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 
1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %33:3 = call @BatchNormTrainingOp22(%32, %arg20, %arg19) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %34 = call @Unknown23(%33#0, %26#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %35 = call @Unknown24(%arg30) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %36 = mhlo.convolution(%34, %35) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %37:3 = call @BatchNormTrainingOp25(%36, %arg27, %arg26) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %38 = call @Unknown26(%37#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %39 = call @Unknown27(%arg31) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %40 = mhlo.convolution(%38, %39) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %41:3 = call @BatchNormTrainingOp28(%40, %arg29, %arg28) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %42 = call @Unknown29(%41#0, %34) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %43 = call @Unknown30(%arg38) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> - %44 = mhlo.convolution(%42, %43) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<256x128x1x1xf16>) -> tensor<1x256x14x14xf16> - %45:3 = call @BatchNormTrainingOp31(%44, %arg40, %arg39) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %46 = call @Unknown32(%arg36) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> - %47 = mhlo.convolution(%42, %46) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>) -> tensor<1x256x14x14xf16> - %48:3 = call @BatchNormTrainingOp33(%47, %arg33, %arg32) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %49 = call @Unknown34(%48#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %50 = call @Unknown35(%arg37) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %51 = mhlo.convolution(%49, %50) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count 
= 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %52:3 = call @BatchNormTrainingOp36(%51, %arg35, %arg34) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %53 = call @Unknown37(%52#0, %45#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %54 = call @Unknown38(%arg45) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %55 = mhlo.convolution(%53, %54) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %56:3 = call @BatchNormTrainingOp39(%55, %arg42, %arg41) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %57 = call @Unknown40(%56#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %58 = call @Unknown41(%arg46) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %59 = mhlo.convolution(%57, %58) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %60:3 = call @BatchNormTrainingOp42(%59, %arg44, %arg43) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %61 = call @Unknown43(%60#0, %53) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %62 = call @Unknown44(%arg53) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> - %63 = mhlo.convolution(%61, %62) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<512x256x1x1xf16>) -> tensor<1x512x7x7xf16> - %64:3 = call @BatchNormTrainingOp45(%63, %arg55, %arg54) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %65 = call @Unknown46(%arg51) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> - %66 = mhlo.convolution(%61, %65) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>) -> tensor<1x512x7x7xf16> - %67:3 = call @BatchNormTrainingOp47(%66, %arg48, %arg47) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %68 = call @Unknown48(%67#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %69 = call @Unknown49(%arg52) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %70 = mhlo.convolution(%68, %69) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, 
precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %71:3 = call @BatchNormTrainingOp50(%70, %arg50, %arg49) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %72 = call @Unknown51(%71#0, %64#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %73 = call @Unknown52(%arg60) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %74 = mhlo.convolution(%72, %73) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %75:3 = call @BatchNormTrainingOp53(%74, %arg57, %arg56) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %76 = call @Unknown54(%75#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %77 = call @Unknown55(%arg61) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %78 = mhlo.convolution(%76, %77) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %79:3 = call @BatchNormTrainingOp56(%78, %arg59, %arg58) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %80 = call @Unknown57(%79#0, %72) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %81 = mhlo.reduce(%80 init: %0) across dimensions = [3, 2] : (tensor<1x512x7x7xf16>, tensor) -> tensor<1x512xf16> - reducer(%arg123: tensor, %arg124: tensor) { - %127 = mhlo.add %arg123, %arg124 : tensor - mhlo.return %127 : tensor - } - %82 = call @Unknown58(%81) : (tensor<1x512xf16>) -> tensor<1x512xf16> - %83 = call @Unknown59(%arg4) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> - %84 = "mhlo.transpose"(%83) {minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1000x512xf16>) -> tensor<512x1000xf16> - %85 = "mhlo.dot_general"(%82, %83) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<1x512xf16>, tensor<1000x512xf16>) -> tensor<1x1000xf16> - %86 = call @Unknown60(%arg3, %85) : (tensor<1000xf32>, tensor<1x1000xf16>) -> tensor<1x1000xf16> - %87 = call @Unknown61(%5#1, %arg63) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %88 = call @Unknown62(%5#2, %arg64) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %89 = call @Unknown63(%10#1, %arg66) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %90 = call @Unknown64(%10#2, %arg67) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %91 = call @Unknown65(%14#1, %arg69) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %92 = call @Unknown66(%14#2, %arg70) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %93 = call @Unknown67(%18#1, %arg72) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %94 = call @Unknown68(%18#2, %arg73) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %95 = call @Unknown69(%22#1, %arg75) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %96 = call @Unknown70(%22#2, %arg76) : 
(tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %97 = call @Unknown71(%29#1, %arg78) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %98 = call @Unknown72(%29#2, %arg79) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %99 = call @Unknown73(%33#1, %arg81) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %100 = call @Unknown74(%33#2, %arg82) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %101 = call @Unknown75(%26#1, %arg84) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %102 = call @Unknown76(%26#2, %arg85) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %103 = call @Unknown77(%37#1, %arg87) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %104 = call @Unknown78(%37#2, %arg88) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %105 = call @Unknown79(%41#1, %arg90) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %106 = call @Unknown80(%41#2, %arg91) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %107 = call @Unknown81(%48#1, %arg93) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %108 = call @Unknown82(%48#2, %arg94) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %109 = call @Unknown83(%52#1, %arg96) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %110 = call @Unknown84(%52#2, %arg97) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %111 = call @Unknown85(%45#1, %arg99) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %112 = call @Unknown86(%45#2, %arg100) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %113 = call @Unknown87(%56#1, %arg102) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %114 = call @Unknown88(%56#2, %arg103) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %115 = call @Unknown89(%60#1, %arg105) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %116 = call @Unknown90(%60#2, %arg106) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %117 = call @Unknown91(%67#1, %arg108) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %118 = call @Unknown92(%67#2, %arg109) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %119 = call @Unknown93(%71#1, %arg111) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %120 = call @Unknown94(%71#2, %arg112) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %121 = call @Unknown95(%64#1, %arg114) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %122 = call @Unknown96(%64#2, %arg115) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %123 = call @Unknown97(%75#1, %arg117) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %124 = call @Unknown98(%75#2, %arg118) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %125 = call @Unknown99(%79#1, %arg120) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %126 = call @Unknown100(%79#2, %arg121) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - return %86, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, %123, %124, %125, %126, %3, %2, %4, %6, %7, %8, %9, %11, %12, %13, %15, %16, 
%17, %19, %20, %21, %23, %27, %28, %30, %31, %32, %24, %25, %34, %35, %36, %38, %39, %40, %42, %46, %47, %49, %50, %51, %43, %44, %53, %54, %55, %57, %58, %59, %61, %65, %66, %68, %69, %70, %62, %63, %72, %73, %74, %76, %77, %78, %80, %82, %84 : tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16> + %7 = call @Unknown4(%arg9) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %8 = mhlo.convolution(%6, %7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %9:3 = call @BatchNormTrainingOp5(%8, 
%arg6, %arg5) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %10 = call @Unknown6(%9#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %11 = call @Unknown4(%arg10) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %12 = mhlo.convolution(%10, %11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %13:3 = call @BatchNormTrainingOp5(%12, %arg8, %arg7) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %14 = call @Unknown9(%13#0, %6) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %15 = call @Unknown4(%arg15) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %16 = mhlo.convolution(%14, %15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %17:3 = call @BatchNormTrainingOp5(%16, %arg12, %arg11) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %18 = call @Unknown6(%17#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %19 = call @Unknown4(%arg16) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %20 = mhlo.convolution(%18, %19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %21:3 = call @BatchNormTrainingOp5(%20, %arg14, %arg13) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %22 = call @Unknown9(%21#0, %14) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %23 = call @Unknown16(%arg23) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> + %24 = mhlo.convolution(%22, %23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<128x64x1x1xf16>) -> tensor<1x128x28x28xf16> + %25:3 = call @BatchNormTrainingOp17(%24, %arg25, %arg24) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %26 = call @Unknown18(%arg21) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> + %27 = mhlo.convolution(%22, %26) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>) -> tensor<1x128x28x28xf16> + %28:3 = call @BatchNormTrainingOp17(%27, %arg18, %arg17) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> 
(tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %29 = call @Unknown20(%28#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %30 = call @Unknown21(%arg22) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %31 = mhlo.convolution(%29, %30) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %32:3 = call @BatchNormTrainingOp17(%31, %arg20, %arg19) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %33 = call @Unknown23(%32#0, %25#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %34 = call @Unknown21(%arg30) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %35 = mhlo.convolution(%33, %34) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %36:3 = call @BatchNormTrainingOp17(%35, %arg27, %arg26) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %37 = call @Unknown20(%36#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %38 = call @Unknown21(%arg31) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %39 = mhlo.convolution(%37, %38) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %40:3 = call @BatchNormTrainingOp17(%39, %arg29, %arg28) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %41 = call @Unknown23(%40#0, %33) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %42 = call @Unknown30(%arg38) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> + %43 = mhlo.convolution(%41, %42) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<256x128x1x1xf16>) -> tensor<1x256x14x14xf16> + %44:3 = call @BatchNormTrainingOp31(%43, %arg40, %arg39) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %45 = call @Unknown32(%arg36) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> + %46 = mhlo.convolution(%41, %45) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>) -> tensor<1x256x14x14xf16> + %47:3 = call @BatchNormTrainingOp31(%46, %arg33, %arg32) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) 
-> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %48 = call @Unknown34(%47#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %49 = call @Unknown35(%arg37) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %50 = mhlo.convolution(%48, %49) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %51:3 = call @BatchNormTrainingOp31(%50, %arg35, %arg34) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %52 = call @Unknown37(%51#0, %44#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %53 = call @Unknown35(%arg45) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %54 = mhlo.convolution(%52, %53) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %55:3 = call @BatchNormTrainingOp31(%54, %arg42, %arg41) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %56 = call @Unknown34(%55#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %57 = call @Unknown35(%arg46) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %58 = mhlo.convolution(%56, %57) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %59:3 = call @BatchNormTrainingOp31(%58, %arg44, %arg43) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %60 = call @Unknown37(%59#0, %52) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %61 = call @Unknown44(%arg53) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> + %62 = mhlo.convolution(%60, %61) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<512x256x1x1xf16>) -> tensor<1x512x7x7xf16> + %63:3 = call @BatchNormTrainingOp45(%62, %arg55, %arg54) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %64 = call @Unknown46(%arg51) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> + %65 = mhlo.convolution(%60, %64) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>) -> tensor<1x512x7x7xf16> + %66:3 = call @BatchNormTrainingOp45(%65, %arg48, %arg47) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> 
(tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %67 = call @Unknown48(%66#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %68 = call @Unknown49(%arg52) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %69 = mhlo.convolution(%67, %68) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %70:3 = call @BatchNormTrainingOp45(%69, %arg50, %arg49) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %71 = call @Unknown51(%70#0, %63#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %72 = call @Unknown49(%arg60) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %73 = mhlo.convolution(%71, %72) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %74:3 = call @BatchNormTrainingOp45(%73, %arg57, %arg56) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %75 = call @Unknown48(%74#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %76 = call @Unknown49(%arg61) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %77 = mhlo.convolution(%75, %76) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %78:3 = call @BatchNormTrainingOp45(%77, %arg59, %arg58) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %79 = call @Unknown51(%78#0, %71) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %80 = call @Unknown58(%79) : (tensor<1x512x7x7xf16>) -> tensor<1x512xf16> + %81 = call @Unknown59(%80) : (tensor<1x512xf16>) -> tensor<1x512xf16> + %82 = call @Unknown60(%arg4) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> + %83 = "mhlo.transpose"(%82) {minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1000x512xf16>) -> tensor<512x1000xf16> + %84 = "mhlo.dot_general"(%81, %82) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<1x512xf16>, tensor<1000x512xf16>) -> tensor<1x1000xf16> + %85 = call @Unknown61(%arg3, %84) : (tensor<1000xf32>, tensor<1x1000xf16>) -> tensor<1x1000xf16> + %86 = call @Unknown62(%4#1, %arg63) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %87 = call @Unknown62(%4#2, %arg64) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %88 = call @Unknown62(%9#1, %arg66) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %89 = call @Unknown62(%9#2, %arg67) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %90 = call @Unknown62(%13#1, %arg69) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %91 = call @Unknown62(%13#2, %arg70) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %92 = call 
@Unknown62(%17#1, %arg72) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %93 = call @Unknown62(%17#2, %arg73) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %94 = call @Unknown62(%21#1, %arg75) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %95 = call @Unknown62(%21#2, %arg76) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %96 = call @Unknown72(%28#1, %arg78) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %97 = call @Unknown72(%28#2, %arg79) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %98 = call @Unknown72(%32#1, %arg81) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %99 = call @Unknown72(%32#2, %arg82) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %100 = call @Unknown72(%25#1, %arg84) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %101 = call @Unknown72(%25#2, %arg85) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %102 = call @Unknown72(%36#1, %arg87) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %103 = call @Unknown72(%36#2, %arg88) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %104 = call @Unknown72(%40#1, %arg90) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %105 = call @Unknown72(%40#2, %arg91) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %106 = call @Unknown82(%47#1, %arg93) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %107 = call @Unknown82(%47#2, %arg94) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %108 = call @Unknown82(%51#1, %arg96) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %109 = call @Unknown82(%51#2, %arg97) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %110 = call @Unknown82(%44#1, %arg99) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %111 = call @Unknown82(%44#2, %arg100) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %112 = call @Unknown82(%55#1, %arg102) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %113 = call @Unknown82(%55#2, %arg103) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %114 = call @Unknown82(%59#1, %arg105) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %115 = call @Unknown82(%59#2, %arg106) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %116 = call @Unknown92(%66#1, %arg108) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %117 = call @Unknown92(%66#2, %arg109) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %118 = call @Unknown92(%70#1, %arg111) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %119 = call @Unknown92(%70#2, %arg112) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %120 = call @Unknown92(%63#1, %arg114) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %121 = call @Unknown92(%63#2, %arg115) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %122 = call @Unknown92(%74#1, %arg117) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %123 = call @Unknown92(%74#2, %arg118) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %124 = call @Unknown92(%78#1, %arg120) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %125 = call @Unknown92(%78#2, %arg121) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + return %85, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, 
%arg57, %arg58, %arg59, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, %123, %124, %125, %2, %1, %3, %5, %6, %7, %8, %10, %11, %12, %14, %15, %16, %18, %19, %20, %22, %26, %27, %29, %30, %31, %23, %24, %33, %34, %35, %37, %38, %39, %41, %45, %46, %48, %49, %50, %42, %43, %52, %53, %54, %56, %57, %58, %60, %64, %65, %67, %68, %69, %61, %62, %71, %72, %73, %75, %76, %77, %79, %81, %83 : tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/FW/4_bufferize_opt.mlir b/compiler/test/E2E/ResNet18/FW/4_bufferize_opt.mlir index 
1e0da346e..7f8aff083 100644 --- a/compiler/test/E2E/ResNet18/FW/4_bufferize_opt.mlir +++ b/compiler/test/E2E/ResNet18/FW/4_bufferize_opt.mlir @@ -2,925 +2,841 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1) -> (d0, d1)> -#map2 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> +#map1 = affine_map<(d0) -> (d0 mod 64, 49)> +#map2 = affine_map<(d0) -> (d0 mod 64 + 1, 49)> +#map3 = affine_map<(d0, d1) -> (d0 - d1)> +#map4 = affine_map<(d0) -> (d0 * 2)> +#map5 = affine_map<(d0) -> (d0 * 2 + 1)> module { func.func private @Unknown0(%arg0: tensor<1x3x224x224xf32>) -> tensor<1x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { + %c224 = arith.constant 224 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1x3x224x224xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x3x224x224xf32>) outs(%0 : tensor<1x3x224x224xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<1x3x224x224xf16> + %1 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %0) -> (tensor<1x3x224x224xf16>) { + %2 = scf.for %arg3 = %c0 to %c224 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x3x224x224xf16>) { + %3 = scf.for %arg5 = %c0 to %c224 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x3x224x224xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x3x224x224xf32> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f32, %out: f16): + %6 = arith.truncf %in : f32 to f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x3x224x224xf16> + scf.yield %inserted_slice : tensor<1x3x224x224xf16> + } + scf.yield %3 : tensor<1x3x224x224xf16> + } + scf.yield %2 : tensor<1x3x224x224xf16> + } return %1 : tensor<1x3x224x224xf16> } func.func private @Unknown1(%arg0: tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x3x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x3x7x7xf32>) outs(%0 : tensor<64x3x7x7xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x3x7x7xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x3x7x7xf16>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x3x7x7xf16>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x3x7x7xf16>) { + %4 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x3x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x3x7x7xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + 
^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x3x7x7xf16> + scf.yield %inserted_slice : tensor<64x3x7x7xf16> + } + scf.yield %4 : tensor<64x3x7x7xf16> + } + scf.yield %3 : tensor<64x3x7x7xf16> + } + scf.yield %2 : tensor<64x3x7x7xf16> + } return %1 : tensor<64x3x7x7xf16> } func.func private @Unknown3(%arg0: tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x112x112xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x64x112x112xf16>) outs(%0 : tensor<1x64x112x112xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x64x112x112xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<1x64x112x112xf16>) { + %2 = scf.for %arg3 = %c0 to %c112 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x64x112x112xf16>) { + %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x64x112x112xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x112x112xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x112x112xf16> + scf.yield %inserted_slice : tensor<1x64x112x112xf16> + } + scf.yield %3 : tensor<1x64x112x112xf16> + } + scf.yield %2 : tensor<1x64x112x112xf16> + } return %1 : tensor<1x64x112x112xf16> } func.func private @Unknown4(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x64x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x64x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x64x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x64x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x64x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 
+ } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x64x3x3xf16> + scf.yield %inserted_slice : tensor<64x64x3x3xf16> + } + scf.yield %4 : tensor<64x64x3x3xf16> + } + scf.yield %3 : tensor<64x64x3x3xf16> + } + scf.yield %2 : tensor<64x64x3x3xf16> + } return %1 : tensor<64x64x3x3xf16> } func.func private @Unknown6(%arg0: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg3 = %c0 to %c56 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg5 = %c0 to %c56 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield %inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } - func.func private @Unknown7(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } func.func private @Unknown9(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @Unknown10(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : 
tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown12(%arg0: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @Unknown13(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown15(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c56 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + %7 = arith.maximumf %6, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield %inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } func.func private @Unknown16(%arg0: tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 
= arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x1x1xf32>) outs(%0 : tensor<128x64x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x64x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x1x1xf16> + scf.yield %inserted_slice : tensor<128x64x1x1xf16> + } + scf.yield %2 : tensor<128x64x1x1xf16> + } return %1 : tensor<128x64x1x1xf16> } func.func private @Unknown18(%arg0: tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x3x3xf32>) outs(%0 : tensor<128x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x64x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x64x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x64x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x3x3xf16> + scf.yield %inserted_slice : tensor<128x64x3x3xf16> + } + scf.yield %4 : tensor<128x64x3x3xf16> + } + scf.yield %3 : tensor<128x64x3x3xf16> + } + scf.yield %2 : tensor<128x64x3x3xf16> + } return %1 : tensor<128x64x3x3xf16> } func.func private @Unknown20(%arg0: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", 
"parallel", "parallel"]} ins(%arg0 : tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x128x28x28xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<1x128x28x28xf16>) { + %2 = scf.for %arg3 = %c0 to %c28 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x128x28x28xf16>) { + %3 = scf.for %arg5 = %c0 to %c28 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x128x28x28xf16> + scf.yield %inserted_slice : tensor<1x128x28x28xf16> + } + scf.yield %3 : tensor<1x128x28x28xf16> + } + scf.yield %2 : tensor<1x128x28x28xf16> + } return %1 : tensor<1x128x28x28xf16> } func.func private @Unknown21(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x128x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x128x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x128x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x128x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x128x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x128x3x3xf16> + scf.yield %inserted_slice : tensor<128x128x3x3xf16> + } + scf.yield %4 : tensor<128x128x3x3xf16> + } + scf.yield %3 : tensor<128x128x3x3xf16> + } + scf.yield %2 : tensor<128x128x3x3xf16> + } return %1 : tensor<128x128x3x3xf16> } func.func private @Unknown23(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", 
"parallel"]} ins(%arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @Unknown24(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown26(%arg0: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @Unknown27(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown29(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x128x28x28xf16> + %1 = scf.for %arg2 = %c0 to %c128 step %c1 iter_args(%arg3 = %0) -> (tensor<1x128x28x28xf16>) { + %2 = scf.for %arg4 = %c0 to %c28 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x128x28x28xf16>) { + %3 = scf.for %arg6 = %c0 to %c28 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + %7 = arith.maximumf %6, %cst : f16 + linalg.yield %7 : f16 + } -> 
tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x128x28x28xf16> + scf.yield %inserted_slice : tensor<1x128x28x28xf16> + } + scf.yield %3 : tensor<1x128x28x28xf16> + } + scf.yield %2 : tensor<1x128x28x28xf16> + } return %1 : tensor<1x128x28x28xf16> } func.func private @Unknown30(%arg0: tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x1x1xf32>) outs(%0 : tensor<256x128x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x128x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x1x1xf16> + scf.yield %inserted_slice : tensor<256x128x1x1xf16> + } + scf.yield %2 : tensor<256x128x1x1xf16> + } return %1 : tensor<256x128x1x1xf16> } func.func private @Unknown32(%arg0: tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x3x3xf32>) outs(%0 : tensor<256x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x128x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x128x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x128x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x3x3xf16> + scf.yield %inserted_slice : tensor<256x128x3x3xf16> + } + scf.yield %4 : tensor<256x128x3x3xf16> + } + 
scf.yield %3 : tensor<256x128x3x3xf16> + } + scf.yield %2 : tensor<256x128x3x3xf16> + } return %1 : tensor<256x128x3x3xf16> } func.func private @Unknown34(%arg0: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x256x14x14xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<1x256x14x14xf16>) { + %2 = scf.for %arg3 = %c0 to %c14 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x256x14x14xf16>) { + %3 = scf.for %arg5 = %c0 to %c14 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x256x14x14xf16> + scf.yield %inserted_slice : tensor<1x256x14x14xf16> + } + scf.yield %3 : tensor<1x256x14x14xf16> + } + scf.yield %2 : tensor<1x256x14x14xf16> + } return %1 : tensor<1x256x14x14xf16> } func.func private @Unknown35(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x256x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x256x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x256x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x256x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x256x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x256x3x3xf16> + scf.yield %inserted_slice : tensor<256x256x3x3xf16> + } + scf.yield %4 : tensor<256x256x3x3xf16> + } + scf.yield %3 : tensor<256x256x3x3xf16> + } + scf.yield %2 
: tensor<256x256x3x3xf16> + } return %1 : tensor<256x256x3x3xf16> } func.func private @Unknown37(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @Unknown38(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown40(%arg0: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @Unknown41(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown43(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x256x14x14xf16> + %1 = scf.for %arg2 = %c0 to %c256 step %c1 iter_args(%arg3 = %0) -> (tensor<1x256x14x14xf16>) { + %2 = scf.for %arg4 = %c0 to %c14 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x256x14x14xf16>) { + %3 = scf.for %arg6 = %c0 to %c14 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x256x14x14xf16>) { + %extracted_slice = 
tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + %7 = arith.maximumf %6, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x256x14x14xf16> + scf.yield %inserted_slice : tensor<1x256x14x14xf16> + } + scf.yield %3 : tensor<1x256x14x14xf16> + } + scf.yield %2 : tensor<1x256x14x14xf16> + } return %1 : tensor<1x256x14x14xf16> } func.func private @Unknown44(%arg0: tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x1x1xf32>) outs(%0 : tensor<512x256x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x256x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x1x1xf16> + scf.yield %inserted_slice : tensor<512x256x1x1xf16> + } + scf.yield %2 : tensor<512x256x1x1xf16> + } return %1 : tensor<512x256x1x1xf16> } func.func private @Unknown46(%arg0: tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x3x3xf32>) outs(%0 : tensor<512x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x256x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x256x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x256x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, 
%arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x3x3xf16> + scf.yield %inserted_slice : tensor<512x256x3x3xf16> + } + scf.yield %4 : tensor<512x256x3x3xf16> + } + scf.yield %3 : tensor<512x256x3x3xf16> + } + scf.yield %2 : tensor<512x256x3x3xf16> + } return %1 : tensor<512x256x3x3xf16> } func.func private @Unknown48(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg3 = %c0 to %c7 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } func.func private @Unknown49(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x512x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x512x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x512x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x512x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x512x3x3xf32> to tensor + %5 = tensor.empty() : 
tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x512x3x3xf16> + scf.yield %inserted_slice : tensor<512x512x3x3xf16> + } + scf.yield %4 : tensor<512x512x3x3xf16> + } + scf.yield %3 : tensor<512x512x3x3xf16> + } + scf.yield %2 : tensor<512x512x3x3xf16> + } return %1 : tensor<512x512x3x3xf16> } func.func private @Unknown51(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c512 step %c1 iter_args(%arg3 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c7 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + %7 = arith.maximumf %6, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } - func.func private @Unknown52(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown54(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown58(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = 
linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x512x7x7xf16> - return %1 : tensor<1x512x7x7xf16> - } - func.func private @Unknown55(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown57(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x512x7x7xf16> - return %1 : tensor<1x512x7x7xf16> - } - func.func private @Unknown58(%arg0: tensor<1x512xf16>) -> tensor<1x512xf16> attributes {__byteir_elementwise_fusion__} { + %collapsed = tensor.collapse_shape %arg0 [[0, 1], [2, 3]] : tensor<1x512x7x7xf16> into tensor<512x49xf16> + %0 = tensor.empty() : tensor<512xf16> + %1 = scf.forall (%arg1) in (512) shared_outs(%arg2 = %0) -> (tensor<512xf16>) { + %extracted_slice = tensor.extract_slice %collapsed[%arg1, 0] [1, 49] [1, 1] : tensor<512x49xf16> to tensor<49xf16> + %expanded_0 = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<49xf16> into tensor<1x49xf16> + %extracted_slice_1 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<512xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %3 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %2) -> (tensor<64xf16>) { + %15 = affine.min #map1(%arg3) + %16 = affine.min #map2(%arg3) + %17 = affine.apply #map3(%16, %15) + %extracted_slice_7 = tensor.extract_slice %expanded_0[0, %15] [1, %17] [1, 1] : tensor<1x49xf16> to tensor + %expanded_8 = tensor.expand_shape %extracted_slice_7 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %extracted_slice_7, %c0 : tensor + %18 = arith.cmpi ugt, %dim, %c0 : index + %19 = scf.if %18 -> (f16) { + %extracted = tensor.extract %expanded_8[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %20 = arith.addf %19, %cst : f16 + %extracted_slice_9 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %20 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %3 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %5 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %4) -> 
(tensor<32xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<32x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_2[%arg3, %c1] : tensor<32x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %5 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %7 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %6) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<16x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_3[%arg3, %c1] : tensor<16x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %7 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %9 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %8) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<8x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_4[%arg3, %c1] : tensor<8x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %9 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %11 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %10) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<4x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_5[%arg3, %c1] : tensor<4x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %11 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %13 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %12) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<2x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_6[%arg3, %c1] : tensor<2x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + 
%inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %14 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_1) -> (tensor) { + %15 = affine.apply #map4(%arg3) + %extracted = tensor.extract %13[%15] : tensor<2xf16> + %16 = arith.addf %extracted, %cst : f16 + %17 = affine.apply #map5(%arg3) + %extracted_7 = tensor.extract %13[%17] : tensor<2xf16> + %18 = arith.addf %extracted_7, %16 : f16 + %inserted = tensor.insert %18 into %arg4[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %14 into %arg2[%arg1] [1] [1] : tensor into tensor<512xf16> + } + } {mapping = [#gpu.block]} + %expanded = tensor.expand_shape %1 [[0, 1]] : tensor<512xf16> into tensor<1x512xf16> + return %expanded : tensor<1x512xf16> + } + func.func private @Unknown59(%arg0: tensor<1x512xf16>) -> tensor<1x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 2.040100e-02 : f16 %0 = tensor.empty() : tensor<1x512xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x512xf16>) outs(%0 : tensor<1x512xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.mulf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x512xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<1x512xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1] [1, 1] [1, 1] : tensor<1x512xf16> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f16, %out: f16): + %4 = arith.mulf %in, %cst : f16 + linalg.yield %4 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[0, %arg1] [1, 1] [1, 1] : tensor into tensor<1x512xf16> + scf.yield %inserted_slice : tensor<1x512xf16> + } return %1 : tensor<1x512xf16> } - func.func private @Unknown59(%arg0: tensor<1000x512xf32>) -> tensor<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown60(%arg0: tensor<1000x512xf32>) -> tensor<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1000x512xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1000x512xf32>) outs(%0 : tensor<1000x512xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<1000x512xf16> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000x512xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1000x512xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<1000x512xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } 
-> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<1000x512xf16> + scf.yield %inserted_slice : tensor<1000x512xf16> + } + scf.yield %2 : tensor<1000x512xf16> + } return %1 : tensor<1000x512xf16> } - func.func private @Unknown60(%arg0: tensor<1000xf32>, %arg1: tensor<1x1000xf16>) -> tensor<1x1000xf16> attributes {__byteir_elementwise_fusion__} { - %expanded = tensor.expand_shape %arg0 [[0, 1]] : tensor<1000xf32> into tensor<1x1000xf32> + func.func private @Unknown61(%arg0: tensor<1000xf32>, %arg1: tensor<1x1000xf16>) -> tensor<1x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1x1000xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %expanded : tensor<1x1000xf16>, tensor<1x1000xf32>) outs(%0 : tensor<1x1000xf16>) { - ^bb0(%in: f16, %in_0: f32, %out: f16): - %2 = arith.truncf %in_0 : f32 to f16 - %3 = arith.addf %in, %2 : f16 - linalg.yield %3 : f16 - } -> tensor<1x1000xf16> + %1 = scf.for %arg2 = %c0 to %c1000 step %c1 iter_args(%arg3 = %0) -> (tensor<1x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<1000xf32> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2] [1, 1] [1, 1] : tensor<1x1000xf16> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_1: f16, %out: f16): + %4 = arith.truncf %in : f32 to f16 + %5 = arith.addf %in_1, %4 : f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[0, %arg2] [1, 1] [1, 1] : tensor into tensor<1x1000xf16> + scf.yield %inserted_slice : tensor<1x1000xf16> + } return %1 : tensor<1x1000xf16> } - func.func private @Unknown61(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } func.func private @Unknown62(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown63(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = 
arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown64(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown65(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown66(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown67(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown68(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, 
tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown69(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<64xf32>) { + %extracted_slice = tensor.extract_slice %arg1[%arg2] [1] [1] : tensor<64xf32> to tensor + %extracted_slice_1 = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<64xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %4 = arith.mulf %in, %cst : f32 + %5 = arith.mulf %in_2, %cst_0 : f32 + %6 = arith.addf %5, %4 : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[%arg2] [1] [1] : tensor into tensor<64xf32> + scf.yield %inserted_slice : tensor<64xf32> + } return %1 : tensor<64xf32> } - func.func private @Unknown70(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown71(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } func.func private @Unknown72(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) 
outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown73(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> + %1 = scf.for %arg2 = %c0 to %c128 step %c1 iter_args(%arg3 = %0) -> (tensor<128xf32>) { + %extracted_slice = tensor.extract_slice %arg1[%arg2] [1] [1] : tensor<128xf32> to tensor + %extracted_slice_1 = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<128xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %4 = arith.mulf %in, %cst : f32 + %5 = arith.mulf %in_2, %cst_0 : f32 + %6 = arith.addf %5, %4 : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[%arg2] [1] [1] : tensor into tensor<128xf32> + scf.yield %inserted_slice : tensor<128xf32> + } return %1 : tensor<128xf32> } - func.func private @Unknown74(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown75(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown76(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = 
arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown77(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown78(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown79(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown80(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown81(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } func.func private 
@Unknown82(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown83(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown84(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown85(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown86(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> + %1 = scf.for %arg2 = %c0 to %c256 step %c1 iter_args(%arg3 = %0) -> (tensor<256xf32>) { + %extracted_slice = tensor.extract_slice %arg1[%arg2] [1] [1] : 
tensor<256xf32> to tensor + %extracted_slice_1 = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<256xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %4 = arith.mulf %in, %cst : f32 + %5 = arith.mulf %in_2, %cst_0 : f32 + %6 = arith.addf %5, %4 : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[%arg2] [1] [1] : tensor into tensor<256xf32> + scf.yield %inserted_slice : tensor<256xf32> + } return %1 : tensor<256xf32> } - func.func private @Unknown87(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown88(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown89(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown90(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown91(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = 
arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } func.func private @Unknown92(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown93(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown94(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown95(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown96(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic 
{indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown97(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown98(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown99(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown100(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> + %1 = scf.for %arg2 = %c0 to %c512 step %c1 iter_args(%arg3 = %0) -> (tensor<512xf32>) { + %extracted_slice = tensor.extract_slice %arg1[%arg2] [1] [1] : tensor<512xf32> to tensor + %extracted_slice_1 = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<512xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %4 = arith.mulf %in, 
%cst : f32 + %5 = arith.mulf %in_2, %cst_0 : f32 + %6 = arith.addf %5, %4 : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[%arg2] [1] [1] : tensor into tensor<512xf32> + scf.yield %inserted_slice : tensor<512xf32> + } return %1 : tensor<512xf32> } func.func @main(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>, %arg2: tensor<64x3x7x7xf32>, %arg3: tensor<1000xf32>, %arg4: tensor<1000x512xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64x64x3x3xf32>, %arg10: tensor<64x64x3x3xf32>, %arg11: tensor<64xf32>, %arg12: tensor<64xf32>, %arg13: tensor<64xf32>, %arg14: tensor<64xf32>, %arg15: tensor<64x64x3x3xf32>, %arg16: tensor<64x64x3x3xf32>, %arg17: tensor<128xf32>, %arg18: tensor<128xf32>, %arg19: tensor<128xf32>, %arg20: tensor<128xf32>, %arg21: tensor<128x64x3x3xf32>, %arg22: tensor<128x128x3x3xf32>, %arg23: tensor<128x64x1x1xf32>, %arg24: tensor<128xf32>, %arg25: tensor<128xf32>, %arg26: tensor<128xf32>, %arg27: tensor<128xf32>, %arg28: tensor<128xf32>, %arg29: tensor<128xf32>, %arg30: tensor<128x128x3x3xf32>, %arg31: tensor<128x128x3x3xf32>, %arg32: tensor<256xf32>, %arg33: tensor<256xf32>, %arg34: tensor<256xf32>, %arg35: tensor<256xf32>, %arg36: tensor<256x128x3x3xf32>, %arg37: tensor<256x256x3x3xf32>, %arg38: tensor<256x128x1x1xf32>, %arg39: tensor<256xf32>, %arg40: tensor<256xf32>, %arg41: tensor<256xf32>, %arg42: tensor<256xf32>, %arg43: tensor<256xf32>, %arg44: tensor<256xf32>, %arg45: tensor<256x256x3x3xf32>, %arg46: tensor<256x256x3x3xf32>, %arg47: tensor<512xf32>, %arg48: tensor<512xf32>, %arg49: tensor<512xf32>, %arg50: tensor<512xf32>, %arg51: tensor<512x256x3x3xf32>, %arg52: tensor<512x512x3x3xf32>, %arg53: tensor<512x256x1x1xf32>, %arg54: tensor<512xf32>, %arg55: tensor<512xf32>, %arg56: tensor<512xf32>, %arg57: tensor<512xf32>, %arg58: tensor<512xf32>, %arg59: tensor<512xf32>, %arg60: tensor<512x512x3x3xf32>, %arg61: tensor<512x512x3x3xf32>, %arg62: tensor, %arg63: tensor<64xf32>, %arg64: tensor<64xf32>, %arg65: tensor, %arg66: tensor<64xf32>, %arg67: tensor<64xf32>, %arg68: tensor, %arg69: tensor<64xf32>, %arg70: tensor<64xf32>, %arg71: tensor, %arg72: tensor<64xf32>, %arg73: tensor<64xf32>, %arg74: tensor, %arg75: tensor<64xf32>, %arg76: tensor<64xf32>, %arg77: tensor, %arg78: tensor<128xf32>, %arg79: tensor<128xf32>, %arg80: tensor, %arg81: tensor<128xf32>, %arg82: tensor<128xf32>, %arg83: tensor, %arg84: tensor<128xf32>, %arg85: tensor<128xf32>, %arg86: tensor, %arg87: tensor<128xf32>, %arg88: tensor<128xf32>, %arg89: tensor, %arg90: tensor<128xf32>, %arg91: tensor<128xf32>, %arg92: tensor, %arg93: tensor<256xf32>, %arg94: tensor<256xf32>, %arg95: tensor, %arg96: tensor<256xf32>, %arg97: tensor<256xf32>, %arg98: tensor, %arg99: tensor<256xf32>, %arg100: tensor<256xf32>, %arg101: tensor, %arg102: tensor<256xf32>, %arg103: tensor<256xf32>, %arg104: tensor, %arg105: tensor<256xf32>, %arg106: tensor<256xf32>, %arg107: tensor, %arg108: tensor<512xf32>, %arg109: tensor<512xf32>, %arg110: tensor, %arg111: tensor<512xf32>, %arg112: tensor<512xf32>, %arg113: tensor, %arg114: tensor<512xf32>, %arg115: tensor<512xf32>, %arg116: tensor, %arg117: tensor<512xf32>, %arg118: tensor<512xf32>, %arg119: tensor, %arg120: tensor<512xf32>, %arg121: tensor<512xf32>, %arg122: tensor<1x3x224x224xf32>) -> (tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, 
tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16>) attributes {__placeholder__byre.entry_point} { @@ -943,7 +859,7 @@ module { %16 = tensor.empty() : tensor<64xf32> %17:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%13, %arg6, %arg5 : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) outs(%14, %15, %16 : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> %18 = call @Unknown6(%17#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %19 = call @Unknown7(%arg10) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %19 = call @Unknown4(%arg10) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> %20 = tensor.empty() : tensor<1x64x56x56xf16> %21 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = 
"NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%18, %19 : tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%20 : tensor<1x64x56x56xf16>) : tensor<1x64x56x56xf16> %22 = tensor.empty() : tensor<1x64x56x56xf16> @@ -951,22 +867,22 @@ module { %24 = tensor.empty() : tensor<64xf32> %25:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%21, %arg8, %arg7 : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) outs(%22, %23, %24 : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> %26 = call @Unknown9(%25#0, %10) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %27 = call @Unknown10(%arg15) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %27 = call @Unknown4(%arg15) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> %28 = tensor.empty() : tensor<1x64x56x56xf16> %29 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%26, %27 : tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%28 : tensor<1x64x56x56xf16>) : tensor<1x64x56x56xf16> %30 = tensor.empty() : tensor<1x64x56x56xf16> %31 = tensor.empty() : tensor<64xf32> %32 = tensor.empty() : tensor<64xf32> %33:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%29, %arg12, %arg11 : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) outs(%30, %31, %32 : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - %34 = call @Unknown12(%33#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %35 = call @Unknown13(%arg16) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %34 = call @Unknown6(%33#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %35 = call @Unknown4(%arg16) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> %36 = tensor.empty() : tensor<1x64x56x56xf16> %37 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%34, %35 : tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%36 : tensor<1x64x56x56xf16>) : tensor<1x64x56x56xf16> %38 = tensor.empty() : tensor<1x64x56x56xf16> %39 = tensor.empty() : tensor<64xf32> %40 = tensor.empty() : tensor<64xf32> %41:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%37, %arg14, %arg13 : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) outs(%38, %39, %40 : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - %42 = call @Unknown15(%41#0, %26) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %42 = call @Unknown9(%41#0, %26) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> 
tensor<1x64x56x56xf16> %43 = call @Unknown16(%arg23) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> %44 = tensor.empty() : tensor<1x128x28x28xf16> %45 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%42, %43 : tensor<1x64x56x56xf16>, tensor<128x64x1x1xf16>) outs(%44 : tensor<1x128x28x28xf16>) : tensor<1x128x28x28xf16> @@ -990,22 +906,22 @@ module { %63 = tensor.empty() : tensor<128xf32> %64:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%60, %arg20, %arg19 : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) outs(%61, %62, %63 : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> %65 = call @Unknown23(%64#0, %49#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %66 = call @Unknown24(%arg30) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %66 = call @Unknown21(%arg30) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> %67 = tensor.empty() : tensor<1x128x28x28xf16> %68 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%65, %66 : tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%67 : tensor<1x128x28x28xf16>) : tensor<1x128x28x28xf16> %69 = tensor.empty() : tensor<1x128x28x28xf16> %70 = tensor.empty() : tensor<128xf32> %71 = tensor.empty() : tensor<128xf32> %72:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%68, %arg27, %arg26 : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) outs(%69, %70, %71 : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - %73 = call @Unknown26(%72#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %74 = call @Unknown27(%arg31) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %73 = call @Unknown20(%72#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %74 = call @Unknown21(%arg31) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> %75 = tensor.empty() : tensor<1x128x28x28xf16> %76 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%73, %74 : tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%75 : tensor<1x128x28x28xf16>) : tensor<1x128x28x28xf16> %77 = tensor.empty() : tensor<1x128x28x28xf16> %78 = tensor.empty() : tensor<128xf32> %79 = tensor.empty() : tensor<128xf32> %80:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%76, %arg29, %arg28 : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) outs(%77, %78, %79 : 
tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - %81 = call @Unknown29(%80#0, %65) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %81 = call @Unknown23(%80#0, %65) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> %82 = call @Unknown30(%arg38) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> %83 = tensor.empty() : tensor<1x256x14x14xf16> %84 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%81, %82 : tensor<1x128x28x28xf16>, tensor<256x128x1x1xf16>) outs(%83 : tensor<1x256x14x14xf16>) : tensor<1x256x14x14xf16> @@ -1029,22 +945,22 @@ module { %102 = tensor.empty() : tensor<256xf32> %103:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%99, %arg35, %arg34 : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) outs(%100, %101, %102 : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> %104 = call @Unknown37(%103#0, %88#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %105 = call @Unknown38(%arg45) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %105 = call @Unknown35(%arg45) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> %106 = tensor.empty() : tensor<1x256x14x14xf16> %107 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%104, %105 : tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%106 : tensor<1x256x14x14xf16>) : tensor<1x256x14x14xf16> %108 = tensor.empty() : tensor<1x256x14x14xf16> %109 = tensor.empty() : tensor<256xf32> %110 = tensor.empty() : tensor<256xf32> %111:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%107, %arg42, %arg41 : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) outs(%108, %109, %110 : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - %112 = call @Unknown40(%111#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %113 = call @Unknown41(%arg46) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %112 = call @Unknown34(%111#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %113 = call @Unknown35(%arg46) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> %114 = tensor.empty() : tensor<1x256x14x14xf16> %115 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%112, %113 : tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%114 : tensor<1x256x14x14xf16>) : 
tensor<1x256x14x14xf16> %116 = tensor.empty() : tensor<1x256x14x14xf16> %117 = tensor.empty() : tensor<256xf32> %118 = tensor.empty() : tensor<256xf32> %119:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%115, %arg44, %arg43 : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) outs(%116, %117, %118 : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - %120 = call @Unknown43(%119#0, %104) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %120 = call @Unknown37(%119#0, %104) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> %121 = call @Unknown44(%arg53) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> %122 = tensor.empty() : tensor<1x512x7x7xf16> %123 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%120, %121 : tensor<1x256x14x14xf16>, tensor<512x256x1x1xf16>) outs(%122 : tensor<1x512x7x7xf16>) : tensor<1x512x7x7xf16> @@ -1068,71 +984,70 @@ module { %141 = tensor.empty() : tensor<512xf32> %142:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%138, %arg50, %arg49 : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) outs(%139, %140, %141 : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> %143 = call @Unknown51(%142#0, %127#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %144 = call @Unknown52(%arg60) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %144 = call @Unknown49(%arg60) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> %145 = tensor.empty() : tensor<1x512x7x7xf16> %146 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%143, %144 : tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%145 : tensor<1x512x7x7xf16>) : tensor<1x512x7x7xf16> %147 = tensor.empty() : tensor<1x512x7x7xf16> %148 = tensor.empty() : tensor<512xf32> %149 = tensor.empty() : tensor<512xf32> %150:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%146, %arg57, %arg56 : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) outs(%147, %148, %149 : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - %151 = call @Unknown54(%150#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %152 = call @Unknown55(%arg61) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %151 = call @Unknown48(%150#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %152 = call @Unknown49(%arg61) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> %153 = tensor.empty() : tensor<1x512x7x7xf16> %154 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : 
i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%151, %152 : tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%153 : tensor<1x512x7x7xf16>) : tensor<1x512x7x7xf16> %155 = tensor.empty() : tensor<1x512x7x7xf16> %156 = tensor.empty() : tensor<512xf32> %157 = tensor.empty() : tensor<512xf32> %158:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%154, %arg59, %arg58 : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) outs(%155, %156, %157 : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - %159 = call @Unknown57(%158#0, %143) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %160 = tensor.empty() : tensor<1x512xf16> - %161 = byre.compute_on_tensor @ReduceSumOp_f16_f16 {dimensions = dense<[3, 2]> : tensor<2xi64>} ins(%159 : tensor<1x512x7x7xf16>) outs(%160 : tensor<1x512xf16>) : tensor<1x512xf16> - %162 = call @Unknown58(%161) : (tensor<1x512xf16>) -> tensor<1x512xf16> - %163 = call @Unknown59(%arg4) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> - %164 = tensor.empty() : tensor<512x1000xf16> - %165 = byre.compute_on_tensor @TransposeOp_f16_f16 {minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} ins(%163 : tensor<1000x512xf16>) outs(%164 : tensor<512x1000xf16>) : tensor<512x1000xf16> - %166 = tensor.empty() : tensor<1x1000xf16> - %167 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 1 : i64} ins(%162, %163 : tensor<1x512xf16>, tensor<1000x512xf16>) outs(%166 : tensor<1x1000xf16>) : tensor<1x1000xf16> - %168 = call @Unknown60(%arg3, %167) : (tensor<1000xf32>, tensor<1x1000xf16>) -> tensor<1x1000xf16> - %169 = call @Unknown61(%7#1, %arg63) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %170 = call @Unknown62(%7#2, %arg64) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %171 = call @Unknown63(%17#1, %arg66) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %172 = call @Unknown64(%17#2, %arg67) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %173 = call @Unknown65(%25#1, %arg69) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %174 = call @Unknown66(%25#2, %arg70) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %175 = call @Unknown67(%33#1, %arg72) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %176 = call @Unknown68(%33#2, %arg73) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %177 = call @Unknown69(%41#1, %arg75) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %178 = call @Unknown70(%41#2, %arg76) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %179 = call @Unknown71(%56#1, %arg78) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %180 = call @Unknown72(%56#2, %arg79) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %181 = call @Unknown73(%64#1, %arg81) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %182 = call @Unknown74(%64#2, %arg82) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %183 = call @Unknown75(%49#1, %arg84) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %184 = call @Unknown76(%49#2, %arg85) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %185 = call @Unknown77(%72#1, %arg87) : 
(tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %186 = call @Unknown78(%72#2, %arg88) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %187 = call @Unknown79(%80#1, %arg90) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %188 = call @Unknown80(%80#2, %arg91) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %189 = call @Unknown81(%95#1, %arg93) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %190 = call @Unknown82(%95#2, %arg94) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %191 = call @Unknown83(%103#1, %arg96) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %192 = call @Unknown84(%103#2, %arg97) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %193 = call @Unknown85(%88#1, %arg99) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %194 = call @Unknown86(%88#2, %arg100) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %195 = call @Unknown87(%111#1, %arg102) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %196 = call @Unknown88(%111#2, %arg103) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %197 = call @Unknown89(%119#1, %arg105) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %198 = call @Unknown90(%119#2, %arg106) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %199 = call @Unknown91(%134#1, %arg108) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %200 = call @Unknown92(%134#2, %arg109) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %201 = call @Unknown93(%142#1, %arg111) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %202 = call @Unknown94(%142#2, %arg112) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %203 = call @Unknown95(%127#1, %arg114) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %204 = call @Unknown96(%127#2, %arg115) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %205 = call @Unknown97(%150#1, %arg117) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %206 = call @Unknown98(%150#2, %arg118) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %207 = call @Unknown99(%158#1, %arg120) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %208 = call @Unknown100(%158#2, %arg121) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - return %168, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %169, %170, %171, %172, %173, %174, %175, %176, %177, %178, %179, %180, %181, %182, %183, %184, %185, %186, %187, %188, %189, %190, %191, %192, %193, %194, %195, %196, %197, %198, %199, %200, %201, %202, %203, %204, %205, %206, %207, %208, %1, %0, %3, %8, %10, %11, %13, %18, %19, %21, %26, %27, %29, %34, %35, %37, %42, %50, %52, %57, %58, %60, %43, %45, %65, %66, %68, %73, %74, %76, %81, %89, %91, %96, %97, %99, %82, %84, %104, %105, %107, %112, %113, %115, %120, %128, %130, %135, %136, %138, %121, %123, %143, %144, %146, %151, %152, %154, %159, %162, %165 : tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, 
tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16> - } -} + %159 = call @Unknown51(%158#0, %143) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %160 = call @Unknown58(%159) : (tensor<1x512x7x7xf16>) -> tensor<1x512xf16> + %161 = call @Unknown59(%160) : (tensor<1x512xf16>) -> tensor<1x512xf16> + %162 = call @Unknown60(%arg4) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> + %163 = tensor.empty() : tensor<512x1000xf16> + %164 = byre.compute_on_tensor @TransposeOp_f16_f16 {minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} ins(%162 : tensor<1000x512xf16>) outs(%163 : tensor<512x1000xf16>) : tensor<512x1000xf16> + %165 = tensor.empty() : tensor<1x1000xf16> + %166 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 1 : i64} ins(%161, %162 : tensor<1x512xf16>, tensor<1000x512xf16>) outs(%165 : tensor<1x1000xf16>) : tensor<1x1000xf16> + %167 = call @Unknown61(%arg3, %166) : (tensor<1000xf32>, tensor<1x1000xf16>) -> tensor<1x1000xf16> + %168 = call @Unknown62(%7#1, %arg63) : (tensor<64xf32>, 
tensor<64xf32>) -> tensor<64xf32> + %169 = call @Unknown62(%7#2, %arg64) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %170 = call @Unknown62(%17#1, %arg66) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %171 = call @Unknown62(%17#2, %arg67) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %172 = call @Unknown62(%25#1, %arg69) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %173 = call @Unknown62(%25#2, %arg70) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %174 = call @Unknown62(%33#1, %arg72) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %175 = call @Unknown62(%33#2, %arg73) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %176 = call @Unknown62(%41#1, %arg75) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %177 = call @Unknown62(%41#2, %arg76) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %178 = call @Unknown72(%56#1, %arg78) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %179 = call @Unknown72(%56#2, %arg79) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %180 = call @Unknown72(%64#1, %arg81) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %181 = call @Unknown72(%64#2, %arg82) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %182 = call @Unknown72(%49#1, %arg84) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %183 = call @Unknown72(%49#2, %arg85) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %184 = call @Unknown72(%72#1, %arg87) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %185 = call @Unknown72(%72#2, %arg88) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %186 = call @Unknown72(%80#1, %arg90) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %187 = call @Unknown72(%80#2, %arg91) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %188 = call @Unknown82(%95#1, %arg93) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %189 = call @Unknown82(%95#2, %arg94) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %190 = call @Unknown82(%103#1, %arg96) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %191 = call @Unknown82(%103#2, %arg97) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %192 = call @Unknown82(%88#1, %arg99) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %193 = call @Unknown82(%88#2, %arg100) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %194 = call @Unknown82(%111#1, %arg102) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %195 = call @Unknown82(%111#2, %arg103) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %196 = call @Unknown82(%119#1, %arg105) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %197 = call @Unknown82(%119#2, %arg106) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %198 = call @Unknown92(%134#1, %arg108) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %199 = call @Unknown92(%134#2, %arg109) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %200 = call @Unknown92(%142#1, %arg111) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %201 = call @Unknown92(%142#2, %arg112) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %202 = call @Unknown92(%127#1, %arg114) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %203 = call @Unknown92(%127#2, %arg115) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %204 = call @Unknown92(%150#1, %arg117) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %205 = call @Unknown92(%150#2, %arg118) : (tensor<512xf32>, 
tensor<512xf32>) -> tensor<512xf32> + %206 = call @Unknown92(%158#1, %arg120) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %207 = call @Unknown92(%158#2, %arg121) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + return %167, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %168, %169, %170, %171, %172, %173, %174, %175, %176, %177, %178, %179, %180, %181, %182, %183, %184, %185, %186, %187, %188, %189, %190, %191, %192, %193, %194, %195, %196, %197, %198, %199, %200, %201, %202, %203, %204, %205, %206, %207, %1, %0, %3, %8, %10, %11, %13, %18, %19, %21, %26, %27, %29, %34, %35, %37, %42, %50, %52, %57, %58, %60, %43, %45, %65, %66, %68, %73, %74, %76, %81, %89, %91, %96, %97, %99, %82, %84, %104, %105, %107, %112, %113, %115, %120, %128, %130, %135, %136, %138, %121, %123, %143, %144, %146, %151, %152, %154, %159, %161, %164 : tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, 
tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16> + } +} \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/FW/5_affine_opt.mlir b/compiler/test/E2E/ResNet18/FW/5_affine_opt.mlir index f1e1e5d69..a78a25416 100644 --- a/compiler/test/E2E/ResNet18/FW/5_affine_opt.mlir +++ b/compiler/test/E2E/ResNet18/FW/5_affine_opt.mlir @@ -2,924 +2,708 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1) -> (d0, d1)> -#map2 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> +#map1 = affine_map<(d0) -> (d0 mod 64, 49)> +#map2 = affine_map<(d0) -> (d0 mod 64 + 1, 49)> +#map3 = affine_map<(d0, d1) -> (d0 - d1)> +#map4 = affine_map<(d0) -> (d0 * 2)> +#map5 = affine_map<(d0) -> (d0 * 2 + 1)> module { func.func private @Unknown0(%arg0: memref<1x3x224x224xf32>) -> memref<1x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c224 = arith.constant 224 : index %alloc = memref.alloc() : memref<1x3x224x224xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x3x224x224xf32>) outs(%alloc : memref<1x3x224x224xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c3 step %c1 { + scf.for %arg2 = %c0 to %c224 step %c1 { + scf.for %arg3 = %c0 to %c224 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x3x224x224xf32> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x3x224x224xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x3x224x224xf16> } func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<64x3x7x7xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x3x7x7xf32>) outs(%alloc : memref<64x3x7x7xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, 
%out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<64x3x7x7xf16> } func.func private @Unknown3(%arg0: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %alloc = memref.alloc() : memref<1x64x112x112xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x64x112x112xf16>) outs(%alloc : memref<1x64x112x112xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c112 step %c1 { + scf.for %arg3 = %c0 to %c112 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x64x112x112xf16> } func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<64x64x3x3xf16> } func.func private @Unknown6(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c56 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : 
memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } - func.func private @Unknown7(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } func.func private @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown10(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown12(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown13(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown15(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : 
memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + %1 = arith.maximumf %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown16(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x1x1xf32>) outs(%alloc : memref<128x64x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<128x64x1x1xf16> } func.func private @Unknown18(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x3x3xf32>) outs(%alloc : memref<128x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : 
memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<128x64x3x3xf16> } func.func private @Unknown20(%arg0: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c28 step %c1 { + scf.for %arg3 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown21(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<128x128x3x3xf16> } func.func private @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x128x28x28xf16> - } - 
func.func private @Unknown24(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown26(%arg0: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown27(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown29(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c28 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + %1 = arith.maximumf %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown30(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x1x1xf32>) outs(%alloc : 
memref<256x128x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<256x128x1x1xf16> } func.func private @Unknown32(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x3x3xf32>) outs(%alloc : memref<256x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<256x128x3x3xf16> } func.func private @Unknown34(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c14 step %c1 { + scf.for %arg3 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown35(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c3 
= arith.constant 3 : index %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<256x256x3x3xf16> } func.func private @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown38(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown40(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown41(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown43(%arg0: memref<1x256x14x14xf16>, %arg1: 
memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c14 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + %1 = arith.maximumf %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown44(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x1x1xf32>) outs(%alloc : memref<512x256x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<512x256x1x1xf16> } func.func private @Unknown46(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x3x3xf32>) outs(%alloc : memref<512x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf32> to memref> + %subview_0 = memref.subview 
%alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<512x256x3x3xf16> } func.func private @Unknown48(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c7 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown49(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<512x512x3x3xf16> } func.func private @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, 
%in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown52(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown54(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + %1 = arith.maximumf %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } - func.func private @Unknown55(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown57(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown58(%arg0: memref<1x512x7x7xf16>) -> memref<1x512xf16> attributes {__byteir_reduction_fusion__} { %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown58(%arg0: memref<1x512xf16>) -> memref<1x512xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %collapse_shape = memref.collapse_shape %arg0 [[0, 1], [2, 3]] : 
memref<1x512x7x7xf16> into memref<512x49xf16> + %alloc = memref.alloc() : memref<512xf16> + scf.forall (%arg1) in (512) { + %subview = memref.subview %collapse_shape[%arg1, 0] [1, 49] [1, 1] : memref<512x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape_0 = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.min #map1(%arg2) + %1 = affine.min #map2(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_6 = memref.subview %expand_shape_0[0, %0] [1, %2] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_7 = memref.expand_shape %subview_6 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %6 = memref.load %expand_shape_7[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %6 : f16 + } else { + scf.yield %cst : f16 + } + %5 = arith.addf %4, %cst : f16 + memref.store %5, %alloca[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_1[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_2[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_3[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_4[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) 
in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_5[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<512xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + %expand_shape = memref.expand_shape %alloc [[0, 1]] : memref<512xf16> into memref<1x512xf16> + return %expand_shape : memref<1x512xf16> + } + func.func private @Unknown59(%arg0: memref<1x512xf16>) -> memref<1x512xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 2.040100e-02 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1x512xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1x512xf16>) outs(%alloc : memref<1x512xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.mulf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[0, %arg1] [1, 1] [1, 1] : memref<1x512xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1] [1, 1] [1, 1] : memref<1x512xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.mulf %in, %cst : f16 + linalg.yield %0 : f16 + } } return %alloc : memref<1x512xf16> } - func.func private @Unknown59(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown60(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1000x512xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1000x512xf32>) outs(%alloc : memref<1000x512xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c1000 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<1000x512xf16> } - func.func private @Unknown60(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> attributes {__byteir_elementwise_fusion__} { - %expand_shape = memref.expand_shape %arg0 [[0, 1]] : memref<1000xf32> into memref<1x1000xf32> + func.func private @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<1x1000xf16> - linalg.generic {indexing_maps = [#map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %expand_shape : memref<1x1000xf16>, memref<1x1000xf32>) outs(%alloc : memref<1x1000xf16>) { - 
^bb0(%in: f16, %in_0: f32, %out: f16): - %0 = arith.truncf %in_0 : f32 to f16 - %1 = arith.addf %in, %0 : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg2] [1] [1] : memref<1000xf32> to memref> + %subview_0 = memref.subview %alloc[0, %arg2] [1, 1] [1, 1] : memref<1x1000xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2] [1, 1] [1, 1] : memref<1x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %in_2: f16, %out: f16): + %0 = arith.truncf %in : f32 to f16 + %1 = arith.addf %in_2, %0 : f16 + linalg.yield %1 : f16 + } } return %alloc : memref<1x1000xf16> } - func.func private @Unknown61(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } func.func private @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 1.000000e-01 : f32 %cst_0 = arith.constant 0.899999976 : f32 + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg1[%arg2] [1] [1] : memref<64xf32> to memref> + %subview_1 = memref.subview %alloc[%arg2] [1] [1] : memref<64xf32> to memref> + %subview_2 = memref.subview %arg0[%arg2] [1] [1] : memref<64xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref>, memref>) outs(%subview_1 : memref>) { + ^bb0(%in: f32, %in_3: f32, %out: f32): + %0 = arith.mulf %in, %cst_0 : f32 + %1 = arith.mulf %in_3, %cst : f32 + %2 = arith.addf %1, %0 : f32 + linalg.yield %2 : f32 + } } return %alloc : memref<64xf32> } - func.func private @Unknown63(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown64(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = 
arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown65(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown66(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown67(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown68(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown69(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: 
f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown70(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown71(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } func.func private @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 1.000000e-01 : f32 %cst_0 = arith.constant 0.899999976 : f32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown73(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg1[%arg2] [1] [1] : memref<128xf32> to memref> + %subview_1 = memref.subview %alloc[%arg2] [1] [1] : memref<128xf32> to memref> + %subview_2 = memref.subview %arg0[%arg2] [1] [1] : memref<128xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref>, memref>) outs(%subview_1 : memref>) { + ^bb0(%in: f32, %in_3: f32, %out: f32): + %0 = arith.mulf %in, %cst_0 : f32 + %1 = arith.mulf %in_3, %cst : f32 + %2 = arith.addf %1, %0 : f32 + linalg.yield %2 : f32 + } } return %alloc : memref<128xf32> } - func.func private @Unknown74(%arg0: 
memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown75(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown76(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown77(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown78(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown79(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic 
{indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown80(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown81(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } func.func private @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 1.000000e-01 : f32 %cst_0 = arith.constant 0.899999976 : f32 + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown83(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown84(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: 
f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown85(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown86(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown87(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg1[%arg2] [1] [1] : memref<256xf32> to memref> + %subview_1 = memref.subview %alloc[%arg2] [1] [1] : memref<256xf32> to memref> + %subview_2 = memref.subview %arg0[%arg2] [1] [1] : memref<256xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref>, memref>) outs(%subview_1 : memref>) { + ^bb0(%in: f32, %in_3: f32, %out: f32): + %0 = arith.mulf %in, %cst_0 : f32 + %1 = arith.mulf %in_3, %cst : f32 + %2 = arith.addf %1, %0 : f32 + linalg.yield %2 : f32 + } } return %alloc : memref<256xf32> } - func.func private @Unknown88(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown89(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes 
{__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown90(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown91(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } func.func private @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 1.000000e-01 : f32 %cst_0 = arith.constant 0.899999976 : f32 + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown93(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown94(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - 
linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown95(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown96(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown97(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown98(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown99(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = 
arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown100(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg1[%arg2] [1] [1] : memref<512xf32> to memref> + %subview_1 = memref.subview %alloc[%arg2] [1] [1] : memref<512xf32> to memref> + %subview_2 = memref.subview %arg0[%arg2] [1] [1] : memref<512xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref>, memref>) outs(%subview_1 : memref>) { + ^bb0(%in: f32, %in_3: f32, %out: f32): + %0 = arith.mulf %in, %cst_0 : f32 + %1 = arith.mulf %in_3, %cst : f32 + %2 = arith.addf %1, %0 : f32 + linalg.yield %2 : f32 + } } return %alloc : memref<512xf32> } @@ -943,7 +727,7 @@ module { %alloc_7 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_4, %arg6, %arg5, %alloc_5, %alloc_6, %alloc_7) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> %4 = call @Unknown6(%alloc_5) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %5 = call @Unknown7(%arg10) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %5 = call @Unknown4(%arg10) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_8 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%4, %5, %alloc_8) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_9 = memref.alloc() : memref<1x64x56x56xf16> @@ -951,22 +735,22 @@ module { %alloc_11 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_8, %arg8, %arg7, %alloc_9, %alloc_10, %alloc_11) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> %6 = call @Unknown9(%alloc_9, %alloc_3) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %7 = call @Unknown10(%arg15) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %7 = call @Unknown4(%arg15) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_12 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%6, %7, %alloc_12) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> 
: tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_13 = memref.alloc() : memref<1x64x56x56xf16> %alloc_14 = memref.alloc() : memref<64xf32> %alloc_15 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_12, %arg12, %arg11, %alloc_13, %alloc_14, %alloc_15) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %8 = call @Unknown12(%alloc_13) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %9 = call @Unknown13(%arg16) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %8 = call @Unknown6(%alloc_13) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %9 = call @Unknown4(%arg16) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_16 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%8, %9, %alloc_16) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_17 = memref.alloc() : memref<1x64x56x56xf16> %alloc_18 = memref.alloc() : memref<64xf32> %alloc_19 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_16, %arg14, %arg13, %alloc_17, %alloc_18, %alloc_19) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %10 = call @Unknown15(%alloc_17, %6) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %10 = call @Unknown9(%alloc_17, %6) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %11 = call @Unknown16(%arg23) : (memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> %alloc_20 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%10, %11, %alloc_20) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16> @@ -990,22 +774,22 @@ module { %alloc_31 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_28, %arg20, %arg19, %alloc_29, %alloc_30, %alloc_31) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> %15 = call @Unknown23(%alloc_29, %alloc_21) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> - %16 = call 
@Unknown24(%arg30) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %16 = call @Unknown21(%arg30) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %alloc_32 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%15, %16, %alloc_32) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_33 = memref.alloc() : memref<1x128x28x28xf16> %alloc_34 = memref.alloc() : memref<128xf32> %alloc_35 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_32, %arg27, %arg26, %alloc_33, %alloc_34, %alloc_35) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %17 = call @Unknown26(%alloc_33) : (memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> - %18 = call @Unknown27(%arg31) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %17 = call @Unknown20(%alloc_33) : (memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %18 = call @Unknown21(%arg31) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %alloc_36 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%17, %18, %alloc_36) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_37 = memref.alloc() : memref<1x128x28x28xf16> %alloc_38 = memref.alloc() : memref<128xf32> %alloc_39 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_36, %arg29, %arg28, %alloc_37, %alloc_38, %alloc_39) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %19 = call @Unknown29(%alloc_37, %15) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %19 = call @Unknown23(%alloc_37, %15) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %20 = call @Unknown30(%arg38) : (memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%19, %20, %alloc_40) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16> @@ -1029,22 +813,22 @@ module { %alloc_51 = memref.alloc() : memref<256xf32> byre.compute 
@BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_48, %arg35, %arg34, %alloc_49, %alloc_50, %alloc_51) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> %24 = call @Unknown37(%alloc_49, %alloc_41) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> - %25 = call @Unknown38(%arg45) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %25 = call @Unknown35(%arg45) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %alloc_52 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%24, %25, %alloc_52) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_53 = memref.alloc() : memref<1x256x14x14xf16> %alloc_54 = memref.alloc() : memref<256xf32> %alloc_55 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_52, %arg42, %arg41, %alloc_53, %alloc_54, %alloc_55) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %26 = call @Unknown40(%alloc_53) : (memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> - %27 = call @Unknown41(%arg46) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %26 = call @Unknown34(%alloc_53) : (memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %27 = call @Unknown35(%arg46) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %alloc_56 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%26, %27, %alloc_56) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_57 = memref.alloc() : memref<1x256x14x14xf16> %alloc_58 = memref.alloc() : memref<256xf32> %alloc_59 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_56, %arg44, %arg43, %alloc_57, %alloc_58, %alloc_59) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %28 = call @Unknown43(%alloc_57, %24) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %28 = call @Unknown37(%alloc_57, %24) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %29 = call @Unknown44(%arg53) : (memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> %alloc_60 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%28, %29, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, 
input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16> @@ -1068,71 +852,70 @@ module { %alloc_71 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_68, %arg50, %arg49, %alloc_69, %alloc_70, %alloc_71) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> %33 = call @Unknown51(%alloc_69, %alloc_61) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %34 = call @Unknown52(%arg60) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %34 = call @Unknown49(%arg60) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %alloc_72 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%33, %34, %alloc_72) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_73 = memref.alloc() : memref<1x512x7x7xf16> %alloc_74 = memref.alloc() : memref<512xf32> %alloc_75 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_72, %arg57, %arg56, %alloc_73, %alloc_74, %alloc_75) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %35 = call @Unknown54(%alloc_73) : (memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %36 = call @Unknown55(%arg61) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %35 = call @Unknown48(%alloc_73) : (memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %36 = call @Unknown49(%arg61) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %alloc_76 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%35, %36, %alloc_76) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_77 = memref.alloc() : memref<1x512x7x7xf16> %alloc_78 = memref.alloc() : memref<512xf32> %alloc_79 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_76, %arg59, %arg58, %alloc_77, %alloc_78, %alloc_79) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %37 = call @Unknown57(%alloc_77, %33) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> 
memref<1x512x7x7xf16> - %alloc_80 = memref.alloc() : memref<1x512xf16> - byre.compute @ReduceSumOp_f16_f16(%37, %alloc_80) {dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<1x512xf16> - %38 = call @Unknown58(%alloc_80) : (memref<1x512xf16>) -> memref<1x512xf16> - %39 = call @Unknown59(%arg4) : (memref<1000x512xf32>) -> memref<1000x512xf16> - %alloc_81 = memref.alloc() : memref<512x1000xf16> - byre.compute @TransposeOp_f16_f16(%39, %alloc_81) {memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16>, memref<512x1000xf16> - %alloc_82 = memref.alloc() : memref<1x1000xf16> - byre.compute @MatmulOp_f16f16_f16(%38, %39, %alloc_82) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16>, memref<1000x512xf16>, memref<1x1000xf16> - %40 = call @Unknown60(%arg3, %alloc_82) : (memref<1000xf32>, memref<1x1000xf16>) -> memref<1x1000xf16> - %41 = call @Unknown61(%alloc_1, %arg63) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %42 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %43 = call @Unknown63(%alloc_6, %arg66) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %44 = call @Unknown64(%alloc_7, %arg67) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %45 = call @Unknown65(%alloc_10, %arg69) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %46 = call @Unknown66(%alloc_11, %arg70) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %47 = call @Unknown67(%alloc_14, %arg72) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %48 = call @Unknown68(%alloc_15, %arg73) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %49 = call @Unknown69(%alloc_18, %arg75) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %50 = call @Unknown70(%alloc_19, %arg76) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %51 = call @Unknown71(%alloc_26, %arg78) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %52 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %53 = call @Unknown73(%alloc_30, %arg81) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %54 = call @Unknown74(%alloc_31, %arg82) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %55 = call @Unknown75(%alloc_22, %arg84) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %56 = call @Unknown76(%alloc_23, %arg85) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %57 = call @Unknown77(%alloc_34, %arg87) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %58 = call @Unknown78(%alloc_35, %arg88) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %59 = call @Unknown79(%alloc_38, %arg90) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %60 = call @Unknown80(%alloc_39, %arg91) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %61 = call @Unknown81(%alloc_46, %arg93) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %62 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %63 = call @Unknown83(%alloc_50, %arg96) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %64 = call @Unknown84(%alloc_51, %arg97) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %65 = call @Unknown85(%alloc_42, %arg99) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %66 = call 
@Unknown86(%alloc_43, %arg100) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %67 = call @Unknown87(%alloc_54, %arg102) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %68 = call @Unknown88(%alloc_55, %arg103) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %69 = call @Unknown89(%alloc_58, %arg105) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %70 = call @Unknown90(%alloc_59, %arg106) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %71 = call @Unknown91(%alloc_66, %arg108) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %72 = call @Unknown92(%alloc_67, %arg109) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %73 = call @Unknown93(%alloc_70, %arg111) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %74 = call @Unknown94(%alloc_71, %arg112) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %75 = call @Unknown95(%alloc_62, %arg114) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %76 = call @Unknown96(%alloc_63, %arg115) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %77 = call @Unknown97(%alloc_74, %arg117) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %78 = call @Unknown98(%alloc_75, %arg118) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %79 = call @Unknown99(%alloc_78, %arg120) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %80 = call @Unknown100(%alloc_79, %arg121) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - return %40, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %38, %alloc_81 : memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, 
memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16> + %37 = call @Unknown51(%alloc_77, %33) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %38 = call @Unknown58(%37) : (memref<1x512x7x7xf16>) -> memref<1x512xf16> + %39 = call @Unknown59(%38) : (memref<1x512xf16>) -> memref<1x512xf16> + %40 = call @Unknown60(%arg4) : (memref<1000x512xf32>) -> memref<1000x512xf16> + %alloc_80 = memref.alloc() : memref<512x1000xf16> + byre.compute @TransposeOp_f16_f16(%40, %alloc_80) {memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16>, memref<512x1000xf16> + %alloc_81 = memref.alloc() : memref<1x1000xf16> + byre.compute @MatmulOp_f16f16_f16(%39, %40, %alloc_81) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16>, memref<1000x512xf16>, memref<1x1000xf16> + %41 = call @Unknown61(%arg3, %alloc_81) : (memref<1000xf32>, memref<1x1000xf16>) -> memref<1x1000xf16> + %42 = call @Unknown62(%alloc_1, %arg63) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %43 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %44 = call @Unknown62(%alloc_6, %arg66) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %45 = call @Unknown62(%alloc_7, %arg67) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %46 = call @Unknown62(%alloc_10, %arg69) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %47 = call @Unknown62(%alloc_11, %arg70) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %48 = call @Unknown62(%alloc_14, %arg72) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %49 = call @Unknown62(%alloc_15, %arg73) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %50 = call @Unknown62(%alloc_18, 
%arg75) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %51 = call @Unknown62(%alloc_19, %arg76) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %52 = call @Unknown72(%alloc_26, %arg78) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %53 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %54 = call @Unknown72(%alloc_30, %arg81) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %55 = call @Unknown72(%alloc_31, %arg82) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %56 = call @Unknown72(%alloc_22, %arg84) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %57 = call @Unknown72(%alloc_23, %arg85) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %58 = call @Unknown72(%alloc_34, %arg87) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %59 = call @Unknown72(%alloc_35, %arg88) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %60 = call @Unknown72(%alloc_38, %arg90) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %61 = call @Unknown72(%alloc_39, %arg91) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %62 = call @Unknown82(%alloc_46, %arg93) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %63 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %64 = call @Unknown82(%alloc_50, %arg96) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %65 = call @Unknown82(%alloc_51, %arg97) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %66 = call @Unknown82(%alloc_42, %arg99) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %67 = call @Unknown82(%alloc_43, %arg100) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %68 = call @Unknown82(%alloc_54, %arg102) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %69 = call @Unknown82(%alloc_55, %arg103) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %70 = call @Unknown82(%alloc_58, %arg105) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %71 = call @Unknown82(%alloc_59, %arg106) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %72 = call @Unknown92(%alloc_66, %arg108) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %73 = call @Unknown92(%alloc_67, %arg109) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %74 = call @Unknown92(%alloc_70, %arg111) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %75 = call @Unknown92(%alloc_71, %arg112) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %76 = call @Unknown92(%alloc_62, %arg114) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %77 = call @Unknown92(%alloc_63, %arg115) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %78 = call @Unknown92(%alloc_74, %arg117) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %79 = call @Unknown92(%alloc_75, %arg118) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %80 = call @Unknown92(%alloc_78, %arg120) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %81 = call @Unknown92(%alloc_79, %arg121) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + return %41, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, 
%59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %39, %alloc_80 : memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/FW/5_alternative_scf_opt.mlir b/compiler/test/E2E/ResNet18/FW/5_alternative_scf_opt.mlir index 
0a4ad01bd..766b56faa 100644 --- a/compiler/test/E2E/ResNet18/FW/5_alternative_scf_opt.mlir +++ b/compiler/test/E2E/ResNet18/FW/5_alternative_scf_opt.mlir @@ -2,924 +2,708 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1) -> (d0, d1)> -#map2 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> +#map1 = affine_map<(d0) -> (d0 mod 64, 49)> +#map2 = affine_map<(d0) -> (d0 mod 64 + 1, 49)> +#map3 = affine_map<(d0, d1) -> (d0 - d1)> +#map4 = affine_map<(d0) -> (d0 * 2)> +#map5 = affine_map<(d0) -> (d0 * 2 + 1)> module { func.func private @Unknown0(%arg0: memref<1x3x224x224xf32>) -> memref<1x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c224 = arith.constant 224 : index %alloc = memref.alloc() : memref<1x3x224x224xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x3x224x224xf32>) outs(%alloc : memref<1x3x224x224xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c3 step %c1 { + scf.for %arg2 = %c0 to %c224 step %c1 { + scf.for %arg3 = %c0 to %c224 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x3x224x224xf32> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x3x224x224xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x3x224x224xf16> } func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<64x3x7x7xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x3x7x7xf32>) outs(%alloc : memref<64x3x7x7xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<64x3x7x7xf16> } func.func private @Unknown3(%arg0: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %alloc = memref.alloc() : memref<1x64x112x112xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = 
["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x64x112x112xf16>) outs(%alloc : memref<1x64x112x112xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c112 step %c1 { + scf.for %arg3 = %c0 to %c112 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x64x112x112xf16> } func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<64x64x3x3xf16> } func.func private @Unknown6(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c56 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } - func.func private @Unknown7(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = 
memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } func.func private @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown10(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown12(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown13(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown15(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] 
[1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + %1 = arith.maximumf %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown16(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x1x1xf32>) outs(%alloc : memref<128x64x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<128x64x1x1xf16> } func.func private @Unknown18(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x3x3xf32>) outs(%alloc : memref<128x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<128x64x3x3xf16> } func.func private @Unknown20(%arg0: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map], 
iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c28 step %c1 { + scf.for %arg3 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown21(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<128x128x3x3xf16> } func.func private @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown24(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown26(%arg0: 
memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown27(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown29(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c28 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + %1 = arith.maximumf %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown30(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x1x1xf32>) outs(%alloc : memref<256x128x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) 
{ + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<256x128x1x1xf16> } func.func private @Unknown32(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x3x3xf32>) outs(%alloc : memref<256x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<256x128x3x3xf16> } func.func private @Unknown34(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c14 step %c1 { + scf.for %arg3 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown35(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview 
%arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<256x256x3x3xf16> } func.func private @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown38(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown40(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown41(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown43(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 + scf.for 
%arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c14 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + %1 = arith.maximumf %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown44(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x1x1xf32>) outs(%alloc : memref<512x256x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<512x256x1x1xf16> } func.func private @Unknown46(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x3x3xf32>) outs(%alloc : memref<512x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<512x256x3x3xf16> } func.func private @Unknown48(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = 
arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c7 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown49(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<512x512x3x3xf16> } func.func private @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown52(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = 
arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown54(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + %1 = arith.maximumf %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } - func.func private @Unknown55(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown57(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown58(%arg0: memref<1x512x7x7xf16>) -> memref<1x512xf16> attributes {__byteir_reduction_fusion__} { %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown58(%arg0: memref<1x512xf16>) -> memref<1x512xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %collapse_shape = memref.collapse_shape %arg0 [[0, 1], [2, 3]] : memref<1x512x7x7xf16> into memref<512x49xf16> + %alloc = memref.alloc() : memref<512xf16> + scf.forall (%arg1) in (512) { + %subview = memref.subview %collapse_shape[%arg1, 0] [1, 49] [1, 1] : memref<512x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape_0 = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.min #map1(%arg2) + %1 = affine.min #map2(%arg2) + %2 = affine.apply #map3(%1, 
%0) + %subview_6 = memref.subview %expand_shape_0[0, %0] [1, %2] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_7 = memref.expand_shape %subview_6 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %6 = memref.load %expand_shape_7[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %6 : f16 + } else { + scf.yield %cst : f16 + } + %5 = arith.addf %4, %cst : f16 + memref.store %5, %alloca[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_1[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_2[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_3[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_4[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_5[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<512xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + %expand_shape = memref.expand_shape %alloc [[0, 1]] : memref<512xf16> into memref<1x512xf16> + return %expand_shape : memref<1x512xf16> + } + func.func private @Unknown59(%arg0: memref<1x512xf16>) -> 
memref<1x512xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 2.040100e-02 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1x512xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1x512xf16>) outs(%alloc : memref<1x512xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.mulf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[0, %arg1] [1, 1] [1, 1] : memref<1x512xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1] [1, 1] [1, 1] : memref<1x512xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.mulf %in, %cst : f16 + linalg.yield %0 : f16 + } } return %alloc : memref<1x512xf16> } - func.func private @Unknown59(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown60(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1000x512xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1000x512xf32>) outs(%alloc : memref<1000x512xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c1000 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<1000x512xf16> } - func.func private @Unknown60(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> attributes {__byteir_elementwise_fusion__} { - %expand_shape = memref.expand_shape %arg0 [[0, 1]] : memref<1000xf32> into memref<1x1000xf32> + func.func private @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<1x1000xf16> - linalg.generic {indexing_maps = [#map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %expand_shape : memref<1x1000xf16>, memref<1x1000xf32>) outs(%alloc : memref<1x1000xf16>) { - ^bb0(%in: f16, %in_0: f32, %out: f16): - %0 = arith.truncf %in_0 : f32 to f16 - %1 = arith.addf %in, %0 : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg2] [1] [1] : memref<1000xf32> to memref> + %subview_0 = memref.subview %alloc[0, %arg2] [1, 1] [1, 1] : memref<1x1000xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2] [1, 1] [1, 1] : memref<1x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { 
+ ^bb0(%in: f32, %in_2: f16, %out: f16): + %0 = arith.truncf %in : f32 to f16 + %1 = arith.addf %in_2, %0 : f16 + linalg.yield %1 : f16 + } } return %alloc : memref<1x1000xf16> } - func.func private @Unknown61(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } func.func private @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 1.000000e-01 : f32 %cst_0 = arith.constant 0.899999976 : f32 + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg1[%arg2] [1] [1] : memref<64xf32> to memref> + %subview_1 = memref.subview %alloc[%arg2] [1] [1] : memref<64xf32> to memref> + %subview_2 = memref.subview %arg0[%arg2] [1] [1] : memref<64xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref>, memref>) outs(%subview_1 : memref>) { + ^bb0(%in: f32, %in_3: f32, %out: f32): + %0 = arith.mulf %in, %cst_0 : f32 + %1 = arith.mulf %in_3, %cst : f32 + %2 = arith.addf %1, %0 : f32 + linalg.yield %2 : f32 + } } return %alloc : memref<64xf32> } - func.func private @Unknown63(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown64(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown65(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> 
memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown66(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown67(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown68(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown69(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown70(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : 
memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown71(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } func.func private @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 1.000000e-01 : f32 %cst_0 = arith.constant 0.899999976 : f32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg1[%arg2] [1] [1] : memref<128xf32> to memref<f32, strided<[], offset: ?>> + %subview_1 = memref.subview %alloc[%arg2] [1] [1] : memref<128xf32> to memref<f32, strided<[], offset: ?>> + %subview_2 = memref.subview %arg0[%arg2] [1] [1] : memref<128xf32> to memref<f32, strided<[], offset: ?>> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref<f32, strided<[], offset: ?>>, memref<f32, strided<[], offset: ?>>) outs(%subview_1 : memref<f32, strided<[], offset: ?>>) { + ^bb0(%in: f32, %in_3: f32, %out: f32): + %0 = arith.mulf %in, %cst_0 : f32 + %1 = arith.mulf %in_3, %cst : f32 + %2 = arith.addf %1, %0 : f32 + linalg.yield %2 : f32 + } } return %alloc : memref<128xf32> } - func.func private @Unknown73(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown74(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32
- } - return %alloc : memref<128xf32> - } - func.func private @Unknown75(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown76(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown77(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown78(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown79(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown80(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 
0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown81(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } func.func private @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 1.000000e-01 : f32 %cst_0 = arith.constant 0.899999976 : f32 + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown83(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown84(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown85(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : 
memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown86(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown87(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg1[%arg2] [1] [1] : memref<256xf32> to memref<f32, strided<[], offset: ?>> + %subview_1 = memref.subview %alloc[%arg2] [1] [1] : memref<256xf32> to memref<f32, strided<[], offset: ?>> + %subview_2 = memref.subview %arg0[%arg2] [1] [1] : memref<256xf32> to memref<f32, strided<[], offset: ?>> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref<f32, strided<[], offset: ?>>, memref<f32, strided<[], offset: ?>>) outs(%subview_1 : memref<f32, strided<[], offset: ?>>) { + ^bb0(%in: f32, %in_3: f32, %out: f32): + %0 = arith.mulf %in, %cst_0 : f32 + %1 = arith.mulf %in_3, %cst : f32 + %2 = arith.addf %1, %0 : f32 + linalg.yield %2 : f32 + } } return %alloc : memref<256xf32> } - func.func private @Unknown88(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown89(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown90(%arg0:
memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown91(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } func.func private @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 1.000000e-01 : f32 %cst_0 = arith.constant 0.899999976 : f32 + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown93(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown94(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown95(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = 
arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown96(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown97(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown98(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown99(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown100(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: 
f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg1[%arg2] [1] [1] : memref<512xf32> to memref<f32, strided<[], offset: ?>> + %subview_1 = memref.subview %alloc[%arg2] [1] [1] : memref<512xf32> to memref<f32, strided<[], offset: ?>> + %subview_2 = memref.subview %arg0[%arg2] [1] [1] : memref<512xf32> to memref<f32, strided<[], offset: ?>> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref<f32, strided<[], offset: ?>>, memref<f32, strided<[], offset: ?>>) outs(%subview_1 : memref<f32, strided<[], offset: ?>>) { + ^bb0(%in: f32, %in_3: f32, %out: f32): + %0 = arith.mulf %in, %cst_0 : f32 + %1 = arith.mulf %in_3, %cst : f32 + %2 = arith.addf %1, %0 : f32 + linalg.yield %2 : f32 + } } return %alloc : memref<512xf32> } @@ -943,7 +727,7 @@ module { %alloc_7 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_4, %arg6, %arg5, %alloc_5, %alloc_6, %alloc_7) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> %4 = call @Unknown6(%alloc_5) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %5 = call @Unknown7(%arg10) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %5 = call @Unknown4(%arg10) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_8 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%4, %5, %alloc_8) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_9 = memref.alloc() : memref<1x64x56x56xf16> @@ -951,22 +735,22 @@ module { %alloc_11 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_8, %arg8, %arg7, %alloc_9, %alloc_10, %alloc_11) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> %6 = call @Unknown9(%alloc_9, %alloc_3) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %7 = call @Unknown10(%arg15) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %7 = call @Unknown4(%arg15) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_12 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%6, %7, %alloc_12) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_13 = memref.alloc() : memref<1x64x56x56xf16> %alloc_14 = memref.alloc() : memref<64xf32> %alloc_15 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_12, %arg12, %arg11, %alloc_13, %alloc_14, %alloc_15) {epsilon = 9.99999974E-6 : f32, feature_index = 1
: i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %8 = call @Unknown12(%alloc_13) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %9 = call @Unknown13(%arg16) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %8 = call @Unknown6(%alloc_13) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %9 = call @Unknown4(%arg16) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_16 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%8, %9, %alloc_16) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_17 = memref.alloc() : memref<1x64x56x56xf16> %alloc_18 = memref.alloc() : memref<64xf32> %alloc_19 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_16, %arg14, %arg13, %alloc_17, %alloc_18, %alloc_19) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %10 = call @Unknown15(%alloc_17, %6) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %10 = call @Unknown9(%alloc_17, %6) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %11 = call @Unknown16(%arg23) : (memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> %alloc_20 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%10, %11, %alloc_20) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16> @@ -990,22 +774,22 @@ module { %alloc_31 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_28, %arg20, %arg19, %alloc_29, %alloc_30, %alloc_31) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> %15 = call @Unknown23(%alloc_29, %alloc_21) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> - %16 = call @Unknown24(%arg30) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %16 = call @Unknown21(%arg30) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %alloc_32 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%15, %16, %alloc_32) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : 
tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_33 = memref.alloc() : memref<1x128x28x28xf16> %alloc_34 = memref.alloc() : memref<128xf32> %alloc_35 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_32, %arg27, %arg26, %alloc_33, %alloc_34, %alloc_35) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %17 = call @Unknown26(%alloc_33) : (memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> - %18 = call @Unknown27(%arg31) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %17 = call @Unknown20(%alloc_33) : (memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %18 = call @Unknown21(%arg31) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %alloc_36 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%17, %18, %alloc_36) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_37 = memref.alloc() : memref<1x128x28x28xf16> %alloc_38 = memref.alloc() : memref<128xf32> %alloc_39 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_36, %arg29, %arg28, %alloc_37, %alloc_38, %alloc_39) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %19 = call @Unknown29(%alloc_37, %15) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %19 = call @Unknown23(%alloc_37, %15) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %20 = call @Unknown30(%arg38) : (memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%19, %20, %alloc_40) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16> @@ -1029,22 +813,22 @@ module { %alloc_51 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_48, %arg35, %arg34, %alloc_49, %alloc_50, %alloc_51) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> %24 = call @Unknown37(%alloc_49, %alloc_41) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> - %25 = call @Unknown38(%arg45) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %25 = call @Unknown35(%arg45) : (memref<256x256x3x3xf32>) -> 
memref<256x256x3x3xf16> %alloc_52 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%24, %25, %alloc_52) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_53 = memref.alloc() : memref<1x256x14x14xf16> %alloc_54 = memref.alloc() : memref<256xf32> %alloc_55 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_52, %arg42, %arg41, %alloc_53, %alloc_54, %alloc_55) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %26 = call @Unknown40(%alloc_53) : (memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> - %27 = call @Unknown41(%arg46) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %26 = call @Unknown34(%alloc_53) : (memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %27 = call @Unknown35(%arg46) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %alloc_56 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%26, %27, %alloc_56) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_57 = memref.alloc() : memref<1x256x14x14xf16> %alloc_58 = memref.alloc() : memref<256xf32> %alloc_59 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_56, %arg44, %arg43, %alloc_57, %alloc_58, %alloc_59) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %28 = call @Unknown43(%alloc_57, %24) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %28 = call @Unknown37(%alloc_57, %24) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %29 = call @Unknown44(%arg53) : (memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> %alloc_60 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%28, %29, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16> @@ -1068,71 +852,70 @@ module { %alloc_71 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_68, %arg50, %arg49, %alloc_69, %alloc_70, %alloc_71) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 
1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> %33 = call @Unknown51(%alloc_69, %alloc_61) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %34 = call @Unknown52(%arg60) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %34 = call @Unknown49(%arg60) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %alloc_72 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%33, %34, %alloc_72) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_73 = memref.alloc() : memref<1x512x7x7xf16> %alloc_74 = memref.alloc() : memref<512xf32> %alloc_75 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_72, %arg57, %arg56, %alloc_73, %alloc_74, %alloc_75) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %35 = call @Unknown54(%alloc_73) : (memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %36 = call @Unknown55(%arg61) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %35 = call @Unknown48(%alloc_73) : (memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %36 = call @Unknown49(%arg61) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %alloc_76 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%35, %36, %alloc_76) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_77 = memref.alloc() : memref<1x512x7x7xf16> %alloc_78 = memref.alloc() : memref<512xf32> %alloc_79 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_76, %arg59, %arg58, %alloc_77, %alloc_78, %alloc_79) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %37 = call @Unknown57(%alloc_77, %33) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %alloc_80 = memref.alloc() : memref<1x512xf16> - byre.compute @ReduceSumOp_f16_f16(%37, %alloc_80) {dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<1x512xf16> - %38 = call @Unknown58(%alloc_80) : (memref<1x512xf16>) -> memref<1x512xf16> - %39 = call @Unknown59(%arg4) : (memref<1000x512xf32>) -> memref<1000x512xf16> - %alloc_81 = memref.alloc() : memref<512x1000xf16> - byre.compute @TransposeOp_f16_f16(%39, %alloc_81) {memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : 
tensor<2xi64>} : memref<1000x512xf16>, memref<512x1000xf16> - %alloc_82 = memref.alloc() : memref<1x1000xf16> - byre.compute @MatmulOp_f16f16_f16(%38, %39, %alloc_82) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16>, memref<1000x512xf16>, memref<1x1000xf16> - %40 = call @Unknown60(%arg3, %alloc_82) : (memref<1000xf32>, memref<1x1000xf16>) -> memref<1x1000xf16> - %41 = call @Unknown61(%alloc_1, %arg63) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %42 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %43 = call @Unknown63(%alloc_6, %arg66) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %44 = call @Unknown64(%alloc_7, %arg67) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %45 = call @Unknown65(%alloc_10, %arg69) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %46 = call @Unknown66(%alloc_11, %arg70) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %47 = call @Unknown67(%alloc_14, %arg72) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %48 = call @Unknown68(%alloc_15, %arg73) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %49 = call @Unknown69(%alloc_18, %arg75) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %50 = call @Unknown70(%alloc_19, %arg76) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %51 = call @Unknown71(%alloc_26, %arg78) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %52 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %53 = call @Unknown73(%alloc_30, %arg81) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %54 = call @Unknown74(%alloc_31, %arg82) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %55 = call @Unknown75(%alloc_22, %arg84) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %56 = call @Unknown76(%alloc_23, %arg85) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %57 = call @Unknown77(%alloc_34, %arg87) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %58 = call @Unknown78(%alloc_35, %arg88) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %59 = call @Unknown79(%alloc_38, %arg90) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %60 = call @Unknown80(%alloc_39, %arg91) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %61 = call @Unknown81(%alloc_46, %arg93) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %62 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %63 = call @Unknown83(%alloc_50, %arg96) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %64 = call @Unknown84(%alloc_51, %arg97) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %65 = call @Unknown85(%alloc_42, %arg99) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %66 = call @Unknown86(%alloc_43, %arg100) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %67 = call @Unknown87(%alloc_54, %arg102) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %68 = call @Unknown88(%alloc_55, %arg103) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %69 = call @Unknown89(%alloc_58, %arg105) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %70 = call @Unknown90(%alloc_59, %arg106) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %71 = call @Unknown91(%alloc_66, %arg108) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %72 = call @Unknown92(%alloc_67, %arg109) : 
(memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %73 = call @Unknown93(%alloc_70, %arg111) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %74 = call @Unknown94(%alloc_71, %arg112) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %75 = call @Unknown95(%alloc_62, %arg114) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %76 = call @Unknown96(%alloc_63, %arg115) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %77 = call @Unknown97(%alloc_74, %arg117) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %78 = call @Unknown98(%alloc_75, %arg118) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %79 = call @Unknown99(%alloc_78, %arg120) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %80 = call @Unknown100(%alloc_79, %arg121) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - return %40, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %38, %alloc_81 : memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, 
memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16> + %37 = call @Unknown51(%alloc_77, %33) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %38 = call @Unknown58(%37) : (memref<1x512x7x7xf16>) -> memref<1x512xf16> + %39 = call @Unknown59(%38) : (memref<1x512xf16>) -> memref<1x512xf16> + %40 = call @Unknown60(%arg4) : (memref<1000x512xf32>) -> memref<1000x512xf16> + %alloc_80 = memref.alloc() : memref<512x1000xf16> + byre.compute @TransposeOp_f16_f16(%40, %alloc_80) {memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16>, memref<512x1000xf16> + %alloc_81 = memref.alloc() : memref<1x1000xf16> + byre.compute @MatmulOp_f16f16_f16(%39, %40, %alloc_81) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16>, memref<1000x512xf16>, memref<1x1000xf16> + %41 = call @Unknown61(%arg3, %alloc_81) : (memref<1000xf32>, memref<1x1000xf16>) -> memref<1x1000xf16> + %42 = call @Unknown62(%alloc_1, %arg63) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %43 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %44 = call @Unknown62(%alloc_6, %arg66) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %45 = call @Unknown62(%alloc_7, %arg67) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %46 = call @Unknown62(%alloc_10, %arg69) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %47 = call @Unknown62(%alloc_11, %arg70) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %48 = call @Unknown62(%alloc_14, %arg72) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %49 = call @Unknown62(%alloc_15, %arg73) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %50 = call @Unknown62(%alloc_18, %arg75) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %51 = call @Unknown62(%alloc_19, %arg76) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %52 = call @Unknown72(%alloc_26, %arg78) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %53 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %54 = call @Unknown72(%alloc_30, %arg81) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %55 = call @Unknown72(%alloc_31, %arg82) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %56 = call @Unknown72(%alloc_22, %arg84) : (memref<128xf32>, memref<128xf32>) -> 
memref<128xf32> + %57 = call @Unknown72(%alloc_23, %arg85) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %58 = call @Unknown72(%alloc_34, %arg87) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %59 = call @Unknown72(%alloc_35, %arg88) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %60 = call @Unknown72(%alloc_38, %arg90) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %61 = call @Unknown72(%alloc_39, %arg91) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %62 = call @Unknown82(%alloc_46, %arg93) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %63 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %64 = call @Unknown82(%alloc_50, %arg96) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %65 = call @Unknown82(%alloc_51, %arg97) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %66 = call @Unknown82(%alloc_42, %arg99) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %67 = call @Unknown82(%alloc_43, %arg100) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %68 = call @Unknown82(%alloc_54, %arg102) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %69 = call @Unknown82(%alloc_55, %arg103) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %70 = call @Unknown82(%alloc_58, %arg105) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %71 = call @Unknown82(%alloc_59, %arg106) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %72 = call @Unknown92(%alloc_66, %arg108) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %73 = call @Unknown92(%alloc_67, %arg109) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %74 = call @Unknown92(%alloc_70, %arg111) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %75 = call @Unknown92(%alloc_71, %arg112) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %76 = call @Unknown92(%alloc_62, %arg114) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %77 = call @Unknown92(%alloc_63, %arg115) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %78 = call @Unknown92(%alloc_74, %arg117) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %79 = call @Unknown92(%alloc_75, %arg118) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %80 = call @Unknown92(%alloc_78, %arg120) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %81 = call @Unknown92(%alloc_79, %arg121) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + return %41, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %39, %alloc_80 : memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, 
memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/FW/6_gpu_opt.mlir b/compiler/test/E2E/ResNet18/FW/6_gpu_opt.mlir index 51d756ad5..41347f655 100644 --- a/compiler/test/E2E/ResNet18/FW/6_gpu_opt.mlir +++ b/compiler/test/E2E/ResNet18/FW/6_gpu_opt.mlir @@ -4,1492 +4,501 @@ module { func.func private @Unknown0(%arg0: memref<1x3x224x224xf32>) -> memref<1x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { + %c224 = arith.constant 224 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c150528 = arith.constant 150528 : index - %c1 = arith.constant 1 : index - %c224 = arith.constant 224 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x3x224x224xf16> scf.for %arg1 = %c0 to %c150528 step %c1 { %0 = arith.remsi %arg1, 
%c224 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c224 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c224 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c224 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c224 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c224 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x3x224x224xf32> - %21 = arith.truncf %20 : f32 to f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x3x224x224xf16> + %1 = arith.divsi %arg1, %c224 : index + %2 = arith.remsi %1, %c224 : index + %3 = arith.divsi %1, %c224 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x3x224x224xf32> + %5 = arith.truncf %4 : f32 to f16 + memref.store %5, %alloc[%c0, %3, %2, %0] : memref<1x3x224x224xf16> } return %alloc : memref<1x3x224x224xf16> } func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c1 = arith.constant 1 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c9408 = arith.constant 9408 : index %alloc = memref.alloc() : memref<64x3x7x7xf16> scf.for %arg1 = %c0 to %c9408 step %c1 { %0 = arith.remsi %arg1, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c3 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c3 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c3 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x3x7x7xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x3x7x7xf16> + %1 = arith.divsi %arg1, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c3 : index + %5 = arith.divsi %3, %c3 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<64x3x7x7xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<64x3x7x7xf16> } return %alloc : memref<64x3x7x7xf16> } func.func private @Unknown3(%arg0: 
memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c112 = arith.constant 112 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x64x112x112xf16> scf.for %arg1 = %c0 to %c802816 step %c1 { %0 = arith.remsi %arg1, %c112 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c112 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c112 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c112 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c112 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c112 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x112x112xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x64x112x112xf16> + %1 = arith.divsi %arg1, %c112 : index + %2 = arith.remsi %1, %c112 : index + %3 = arith.divsi %1, %c112 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x64x112x112xf16> + %5 = arith.maximumf %4, %cst : f16 + memref.store %5, %alloc[%c0, %3, %2, %0] : memref<1x64x112x112xf16> } return %alloc : memref<1x64x112x112xf16> } func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, 
%alloc[%29, %23, %13, %3] : memref<64x64x3x3xf16> - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - scf.for %arg1 = %c0 to %c200704 step %c1 { - %0 = arith.remsi %arg1, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown7(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index %c0 = arith.constant 0 : index %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<64x64x3x3xf16> scf.for %arg1 = %c0 to %c36864 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = 
memref.load %arg0[%5, %4, %2, %0] : memref<64x64x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<64x64x3x3xf16> } return %alloc : memref<64x64x3x3xf16> } - func.func private @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index + func.func private @Unknown6(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - scf.for %arg2 = %c0 to %c200704 step %c1 { - %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %22 = arith.addf %20, %21 : f16 - %23 = arith.maxnumf %22, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown10(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load 
%arg0[%29, %23, %13, %3] : memref<64x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf16> - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown12(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> scf.for %arg1 = %c0 to %c200704 step %c1 { %0 = arith.remsi %arg1, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> + %1 = arith.divsi %arg1, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %5 = arith.maximumf %4, %cst : f16 + memref.store %5, %alloc[%c0, %3, %2, %0] : memref<1x64x56x56xf16> } return %alloc : memref<1x64x56x56xf16> } - func.func private @Unknown13(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index + func.func private @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi 
slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf16> - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown15(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> scf.for %arg2 = %c0 to %c200704 step %c1 { %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %22 = arith.addf %20, %21 : f16 - %23 = arith.maxnumf %22, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> + %1 = arith.divsi %arg2, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %6 = arith.addf %4, %5 : f16 + %7 = arith.maximumf %6, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : memref<1x64x56x56xf16> } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown16(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<128x64x1x1xf16> scf.for %arg1 = %c0 to %c8192 step %c1 { %0 = arith.remsi %arg1, %c64 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c64 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c64 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<128x64x1x1xf32> - %11 = arith.truncf %10 : f32 to f16 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<128x64x1x1xf16> + %1 = arith.divsi %arg1, %c64 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : 
memref<128x64x1x1xf32> + %3 = arith.truncf %2 : f32 to f16 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<128x64x1x1xf16> } return %alloc : memref<128x64x1x1xf16> } func.func private @Unknown18(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c73728 = arith.constant 73728 : index %alloc = memref.alloc() : memref<128x64x3x3xf16> scf.for %arg1 = %c0 to %c73728 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x64x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<128x64x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<128x64x3x3xf16> } return %alloc : memref<128x64x3x3xf16> } func.func private @Unknown20(%arg0: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c28 = arith.constant 28 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> scf.for %arg1 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg1, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = 
arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x128x28x28xf16> + %1 = arith.divsi %arg1, %c28 : index + %2 = arith.remsi %1, %c28 : index + %3 = arith.divsi %1, %c28 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x128x28x28xf16> + %5 = arith.maximumf %4, %cst : f16 + memref.store %5, %alloc[%c0, %3, %2, %0] : memref<1x128x28x28xf16> } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown21(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - scf.for %arg1 = %c0 to %c147456 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf16> - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - scf.for %arg2 = %c0 to %c100352 step %c1 { - %0 = arith.remsi %arg2, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, 
%c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %22 = arith.addf %20, %21 : f16 - %23 = arith.maxnumf %22, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown24(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index %c0 = arith.constant 0 : index %c147456 = arith.constant 147456 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<128x128x3x3xf16> scf.for %arg1 = %c0 to %c147456 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<128x128x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<128x128x3x3xf16> } return %alloc : memref<128x128x3x3xf16> } - func.func private @Unknown26(%arg0: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index + func.func private @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - scf.for %arg1 = %c0 to %c100352 step %c1 { - %0 = arith.remsi %arg1, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 
= arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown27(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - scf.for %arg1 = %c0 to %c147456 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf16> - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown29(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> scf.for %arg2 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg2, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, 
%arg2 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %22 = arith.addf %20, %21 : f16 - %23 = arith.maxnumf %22, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x128x28x28xf16> + %1 = arith.divsi %arg2, %c28 : index + %2 = arith.remsi %1, %c28 : index + %3 = arith.divsi %1, %c28 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x128x28x28xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x128x28x28xf16> + %6 = arith.addf %4, %5 : f16 + %7 = arith.maximumf %6, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : memref<1x128x28x28xf16> } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown30(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<256x128x1x1xf16> scf.for %arg1 = %c0 to %c32768 step %c1 { %0 = arith.remsi %arg1, %c128 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c128 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c128 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<256x128x1x1xf32> - %11 = arith.truncf %10 : f32 to f16 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<256x128x1x1xf16> + %1 = arith.divsi %arg1, %c128 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<256x128x1x1xf32> + %3 = arith.truncf %2 : f32 to f16 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<256x128x1x1xf16> } return %alloc : memref<256x128x1x1xf16> } func.func private @Unknown32(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c294912 = arith.constant 294912 : index %alloc = memref.alloc() : memref<256x128x3x3xf16> scf.for %arg1 = %c0 to %c294912 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 
: index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x128x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x128x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<256x128x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<256x128x3x3xf16> } return %alloc : memref<256x128x3x3xf16> } func.func private @Unknown34(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c14 = arith.constant 14 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c50176 = arith.constant 50176 : index - %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> scf.for %arg1 = %c0 to %c50176 step %c1 { %0 = arith.remsi %arg1, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x256x14x14xf16> + %1 = arith.divsi %arg1, %c14 : index + %2 = arith.remsi %1, %c14 : index + %3 = arith.divsi %1, %c14 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x256x14x14xf16> + %5 = arith.maximumf %4, %cst : f16 + memref.store %5, %alloc[%c0, %3, %2, %0] : memref<1x256x14x14xf16> } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown35(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - scf.for %arg1 = %c0 to %c589824 step %c1 { - %0 = 
arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf16> - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - scf.for %arg2 = %c0 to %c50176 step %c1 { - %0 = arith.remsi %arg2, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %22 = arith.addf %20, %21 : f16 - %23 = arith.maxnumf %22, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown38(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index %c0 = arith.constant 0 : index %c589824 = arith.constant 589824 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x256x3x3xf16> scf.for %arg1 = %c0 to %c589824 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = 
arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<256x256x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<256x256x3x3xf16> } return %alloc : memref<256x256x3x3xf16> } - func.func private @Unknown40(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c1 = arith.constant 1 : index + func.func private @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - scf.for %arg1 = %c0 to %c50176 step %c1 { - %0 = arith.remsi %arg1, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown41(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index %c1 = arith.constant 1 : index - %c3 = 
arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - scf.for %arg1 = %c0 to %c589824 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf16> - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown43(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c50176 = arith.constant 50176 : index - %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> scf.for %arg2 = %c0 to %c50176 step %c1 { %0 = arith.remsi %arg2, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %22 = arith.addf %20, %21 : f16 - %23 = arith.maxnumf %22, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x256x14x14xf16> + %1 = arith.divsi %arg2, %c14 : index + %2 = arith.remsi %1, %c14 : index + %3 = arith.divsi %1, %c14 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x256x14x14xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x256x14x14xf16> + %6 = arith.addf %4, %5 : f16 + %7 = arith.maximumf %6, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : 
memref<1x256x14x14xf16> } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown44(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c1 = arith.constant 1 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<512x256x1x1xf16> scf.for %arg1 = %c0 to %c131072 step %c1 { %0 = arith.remsi %arg1, %c256 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c256 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c256 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<512x256x1x1xf32> - %11 = arith.truncf %10 : f32 to f16 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<512x256x1x1xf16> + %1 = arith.divsi %arg1, %c256 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<512x256x1x1xf32> + %3 = arith.truncf %2 : f32 to f16 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<512x256x1x1xf16> } return %alloc : memref<512x256x1x1xf16> } func.func private @Unknown46(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1179648 = arith.constant 1179648 : index %alloc = memref.alloc() : memref<512x256x3x3xf16> scf.for %arg1 = %c0 to %c1179648 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x256x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x256x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<512x256x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + 
memref.store %7, %alloc[%5, %4, %2, %0] : memref<512x256x3x3xf16> } return %alloc : memref<512x256x3x3xf16> } func.func private @Unknown48(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c7 = arith.constant 7 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c25088 = arith.constant 25088 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> scf.for %arg1 = %c0 to %c25088 step %c1 { %0 = arith.remsi %arg1, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x512x7x7xf16> + %1 = arith.divsi %arg1, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %5 = arith.maximumf %4, %cst : f16 + memref.store %5, %alloc[%c0, %3, %2, %0] : memref<1x512x7x7xf16> } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown49(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c1 = arith.constant 1 : index %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index + %c2359296 = arith.constant 2359296 : index %alloc = memref.alloc() : memref<512x512x3x3xf16> scf.for %arg1 = %c0 to %c2359296 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : 
index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<512x512x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<512x512x3x3xf16> } return %alloc : memref<512x512x3x3xf16> } func.func private @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c7 = arith.constant 7 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c25088 = arith.constant 25088 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> scf.for %arg2 = %c0 to %c25088 step %c1 { %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %22 = arith.addf %20, %21 : f16 - %23 = arith.maxnumf %22, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x512x7x7xf16> + %1 = arith.divsi %arg2, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %6 = arith.addf %4, %5 : f16 + %7 = arith.maximumf %6, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : memref<1x512x7x7xf16> } return %alloc : memref<1x512x7x7xf16> } - func.func private @Unknown52(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown58(%arg0: memref<1x512x7x7xf16>) -> memref<1x512xf16> attributes {__byteir_reduction_fusion__} { %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - scf.for %arg1 = %c0 to %c2359296 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = 
arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf16> - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown54(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c64 = arith.constant 64 : index + %c49 = arith.constant 49 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %collapse_shape = memref.collapse_shape %arg0 [[0, 1], [2, 3]] : memref<1x512x7x7xf16> into memref<512x49xf16> + %alloc = memref.alloc() : memref<512xf16> + scf.forall (%arg1) in (512) { + %subview = memref.subview %collapse_shape[%arg1, 0] [1, 49] [1, 1] : memref<512x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape_0 = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = arith.remsi %arg2, %c64 : index + %1 = arith.cmpi slt, %0, %c0 : index + %2 = arith.addi %0, %c64 : index + %3 = arith.select %1, %2, %0 : index + %4 = arith.cmpi slt, %3, %c49 : index + %5 = arith.select %4, %3, %c49 : index + %6 = arith.addi %3, %c1 : index + %7 = arith.cmpi slt, %6, %c49 : index + %8 = arith.select %7, %6, %c49 : index + %9 = arith.subi %8, %5 : index + %subview_6 = memref.subview %expand_shape_0[0, %5] [1, %9] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_7 = memref.expand_shape %subview_6 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %10 = arith.cmpi ugt, %9, %c0 : index + %11 = scf.if %10 -> (f16) { + %13 = memref.load %expand_shape_7[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %13 : f16 + } else { + scf.yield %cst : f16 + } + %12 = arith.addf %11, %cst : f16 + memref.store %12, %alloca[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in 
(16) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_1[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_1[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_2[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_2[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_3[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_3[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_4[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_4[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_5[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_5[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<512xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + %expand_shape = memref.expand_shape %alloc [[0, 1]] : memref<512xf16> into memref<1x512xf16> + return %expand_shape : memref<1x512xf16> + } + func.func private @Unknown59(%arg0: memref<1x512xf16>) -> memref<1x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x512x7x7xf16> - scf.for %arg1 = %c0 to %c25088 step %c1 { - %0 = arith.remsi %arg1, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = 
arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown55(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - scf.for %arg1 = %c0 to %c2359296 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf16> - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown57(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x512x7x7xf16> - scf.for %arg2 = %c0 to %c25088 step %c1 { - %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %22 = arith.addf %20, %21 : f16 - %23 = arith.maxnumf 
%22, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown58(%arg0: memref<1x512xf16>) -> memref<1x512xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 2.040100e-02 : f16 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<1x512xf16> scf.for %arg1 = %c0 to %c512 step %c1 { %0 = memref.load %arg0[%c0, %arg1] : memref<1x512xf16> @@ -1498,719 +507,98 @@ module { } return %alloc : memref<1x512xf16> } - func.func private @Unknown59(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown60(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c512000 = arith.constant 512000 : index - %c1 = arith.constant 1 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1000x512xf16> scf.for %arg1 = %c0 to %c512000 step %c1 { %0 = arith.remsi %arg1, %c512 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c512 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c512 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3] : memref<1000x512xf32> - %11 = arith.truncf %10 : f32 to f16 - memref.store %11, %alloc[%9, %3] : memref<1000x512xf16> + %1 = arith.divsi %arg1, %c512 : index + %2 = memref.load %arg0[%1, %0] : memref<1000x512xf32> + %3 = arith.truncf %2 : f32 to f16 + memref.store %3, %alloc[%1, %0] : memref<1000x512xf16> } return %alloc : memref<1000x512xf16> } - func.func private @Unknown60(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index + func.func private @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> attributes {__byteir_elementwise_fusion__} { %c1000 = arith.constant 1000 : index %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %alloc = memref.alloc() : memref<1x1000xf16> scf.for %arg2 = %c0 to %c1000 step %c1 { - %0 = memref.load %arg1[%c0, %arg2] : memref<1x1000xf16> - %1 = memref.load %arg0[%arg2] : memref<1000xf32> - %2 = arith.truncf %1 : f32 to f16 - %3 = arith.addf %0, %2 : f16 + %0 = memref.load %arg0[%arg2] : memref<1000xf32> + %1 = memref.load %arg1[%c0, %arg2] : memref<1x1000xf16> + %2 = arith.truncf %0 : f32 to f16 + %3 = arith.addf %1, %2 : f16 memref.store %3, %alloc[%c0, %arg2] : memref<1x1000xf16> } return %alloc : memref<1x1000xf16> } - func.func private @Unknown61(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - 
memref.store %4, %alloc[%arg2] : memref<64xf32> - } - return %alloc : memref<64xf32> - } func.func private @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<64xf32> - } - return %alloc : memref<64xf32> - } - func.func private @Unknown63(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<64xf32> - } - return %alloc : memref<64xf32> - } - func.func private @Unknown64(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<64xf32> - } - return %alloc : memref<64xf32> - } - func.func private @Unknown65(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<64xf32> - } - return %alloc : memref<64xf32> - } - func.func private @Unknown66(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<64xf32> - } - 
return %alloc : memref<64xf32> - } - func.func private @Unknown67(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<64xf32> - } - return %alloc : memref<64xf32> - } - func.func private @Unknown68(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<64xf32> - } - return %alloc : memref<64xf32> - } - func.func private @Unknown69(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<64xf32> - } - return %alloc : memref<64xf32> - } - func.func private @Unknown70(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<64xf32> scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 + %0 = memref.load %arg1[%arg2] : memref<64xf32> + %1 = memref.load %arg0[%arg2] : memref<64xf32> + %2 = arith.mulf %0, %cst : f32 + %3 = arith.mulf %1, %cst_0 : f32 %4 = arith.addf %3, %2 : f32 memref.store %4, %alloc[%arg2] : memref<64xf32> } return %alloc : memref<64xf32> } - func.func private @Unknown71(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, 
%cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } func.func private @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } - func.func private @Unknown73(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } - func.func private @Unknown74(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } - func.func private @Unknown75(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } - func.func private @Unknown76(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, 
%cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } - func.func private @Unknown77(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } - func.func private @Unknown78(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } - func.func private @Unknown79(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } - func.func private @Unknown80(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<128xf32> scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 + %0 = memref.load %arg1[%arg2] : memref<128xf32> + %1 = memref.load %arg0[%arg2] : memref<128xf32> + %2 = arith.mulf %0, %cst : f32 + %3 = arith.mulf %1, %cst_0 : f32 %4 = arith.addf %3, %2 : f32 memref.store %4, %alloc[%arg2] : memref<128xf32> } return %alloc : memref<128xf32> } - func.func private @Unknown81(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = 
memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } func.func private @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } - func.func private @Unknown83(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } - func.func private @Unknown84(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } - func.func private @Unknown85(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } - func.func private @Unknown86(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = 
memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } - func.func private @Unknown87(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } - func.func private @Unknown88(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } - func.func private @Unknown89(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } - func.func private @Unknown90(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<256xf32> scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 + %0 = memref.load %arg1[%arg2] : memref<256xf32> + %1 = memref.load %arg0[%arg2] : memref<256xf32> + %2 = arith.mulf %0, %cst : f32 + %3 = arith.mulf %1, %cst_0 : f32 %4 = arith.addf %3, %2 : f32 memref.store %4, %alloc[%arg2] : memref<256xf32> } return %alloc : memref<256xf32> } - func.func private @Unknown91(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c512 = 
arith.constant 512 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } func.func private @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } - func.func private @Unknown93(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } - func.func private @Unknown94(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } - func.func private @Unknown95(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } - func.func private @Unknown96(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index %c512 = 
arith.constant 512 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } - func.func private @Unknown97(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } - func.func private @Unknown98(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } - func.func private @Unknown99(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } - func.func private @Unknown100(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<512xf32> scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 + %0 = memref.load %arg1[%arg2] : memref<512xf32> + %1 = memref.load %arg0[%arg2] : memref<512xf32> + %2 = arith.mulf %0, %cst : f32 + %3 = arith.mulf %1, %cst_0 : f32 %4 = arith.addf %3, %2 : f32 memref.store %4, %alloc[%arg2] : memref<512xf32> } @@ -2236,7 +624,7 @@ module { %alloc_7 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_4, %arg6, %arg5, 
%alloc_5, %alloc_6, %alloc_7) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> %4 = call @Unknown6(%alloc_5) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %5 = call @Unknown7(%arg10) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %5 = call @Unknown4(%arg10) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_8 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%4, %5, %alloc_8) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_9 = memref.alloc() : memref<1x64x56x56xf16> @@ -2244,22 +632,22 @@ module { %alloc_11 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_8, %arg8, %arg7, %alloc_9, %alloc_10, %alloc_11) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> %6 = call @Unknown9(%alloc_9, %alloc_3) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %7 = call @Unknown10(%arg15) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %7 = call @Unknown4(%arg15) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_12 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%6, %7, %alloc_12) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_13 = memref.alloc() : memref<1x64x56x56xf16> %alloc_14 = memref.alloc() : memref<64xf32> %alloc_15 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_12, %arg12, %arg11, %alloc_13, %alloc_14, %alloc_15) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %8 = call @Unknown12(%alloc_13) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %9 = call @Unknown13(%arg16) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %8 = call @Unknown6(%alloc_13) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %9 = call @Unknown4(%arg16) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_16 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%8, %9, %alloc_16) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, 
window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_17 = memref.alloc() : memref<1x64x56x56xf16> %alloc_18 = memref.alloc() : memref<64xf32> %alloc_19 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_16, %arg14, %arg13, %alloc_17, %alloc_18, %alloc_19) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %10 = call @Unknown15(%alloc_17, %6) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %10 = call @Unknown9(%alloc_17, %6) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %11 = call @Unknown16(%arg23) : (memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> %alloc_20 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%10, %11, %alloc_20) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16> @@ -2283,22 +671,22 @@ module { %alloc_31 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_28, %arg20, %arg19, %alloc_29, %alloc_30, %alloc_31) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> %15 = call @Unknown23(%alloc_29, %alloc_21) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> - %16 = call @Unknown24(%arg30) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %16 = call @Unknown21(%arg30) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %alloc_32 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%15, %16, %alloc_32) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_33 = memref.alloc() : memref<1x128x28x28xf16> %alloc_34 = memref.alloc() : memref<128xf32> %alloc_35 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_32, %arg27, %arg26, %alloc_33, %alloc_34, %alloc_35) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %17 = call @Unknown26(%alloc_33) : (memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> - %18 = call @Unknown27(%arg31) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %17 = call @Unknown20(%alloc_33) : (memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %18 = call @Unknown21(%arg31) : (memref<128x128x3x3xf32>) -> 
memref<128x128x3x3xf16> %alloc_36 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%17, %18, %alloc_36) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_37 = memref.alloc() : memref<1x128x28x28xf16> %alloc_38 = memref.alloc() : memref<128xf32> %alloc_39 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_36, %arg29, %arg28, %alloc_37, %alloc_38, %alloc_39) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %19 = call @Unknown29(%alloc_37, %15) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %19 = call @Unknown23(%alloc_37, %15) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %20 = call @Unknown30(%arg38) : (memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%19, %20, %alloc_40) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16> @@ -2322,22 +710,22 @@ module { %alloc_51 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_48, %arg35, %arg34, %alloc_49, %alloc_50, %alloc_51) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> %24 = call @Unknown37(%alloc_49, %alloc_41) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> - %25 = call @Unknown38(%arg45) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %25 = call @Unknown35(%arg45) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %alloc_52 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%24, %25, %alloc_52) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_53 = memref.alloc() : memref<1x256x14x14xf16> %alloc_54 = memref.alloc() : memref<256xf32> %alloc_55 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_52, %arg42, %arg41, %alloc_53, %alloc_54, %alloc_55) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : 
memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %26 = call @Unknown40(%alloc_53) : (memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> - %27 = call @Unknown41(%arg46) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %26 = call @Unknown34(%alloc_53) : (memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %27 = call @Unknown35(%arg46) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %alloc_56 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%26, %27, %alloc_56) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_57 = memref.alloc() : memref<1x256x14x14xf16> %alloc_58 = memref.alloc() : memref<256xf32> %alloc_59 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_56, %arg44, %arg43, %alloc_57, %alloc_58, %alloc_59) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %28 = call @Unknown43(%alloc_57, %24) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %28 = call @Unknown37(%alloc_57, %24) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %29 = call @Unknown44(%arg53) : (memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> %alloc_60 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%28, %29, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16> @@ -2361,71 +749,70 @@ module { %alloc_71 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_68, %arg50, %arg49, %alloc_69, %alloc_70, %alloc_71) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> %33 = call @Unknown51(%alloc_69, %alloc_61) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %34 = call @Unknown52(%arg60) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %34 = call @Unknown49(%arg60) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %alloc_72 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%33, %34, %alloc_72) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, 
memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_73 = memref.alloc() : memref<1x512x7x7xf16> %alloc_74 = memref.alloc() : memref<512xf32> %alloc_75 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_72, %arg57, %arg56, %alloc_73, %alloc_74, %alloc_75) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %35 = call @Unknown54(%alloc_73) : (memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %36 = call @Unknown55(%arg61) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %35 = call @Unknown48(%alloc_73) : (memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %36 = call @Unknown49(%arg61) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %alloc_76 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%35, %36, %alloc_76) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_77 = memref.alloc() : memref<1x512x7x7xf16> %alloc_78 = memref.alloc() : memref<512xf32> %alloc_79 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_76, %arg59, %arg58, %alloc_77, %alloc_78, %alloc_79) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %37 = call @Unknown57(%alloc_77, %33) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %alloc_80 = memref.alloc() : memref<1x512xf16> - byre.compute @ReduceSumOp_f16_f16(%37, %alloc_80) {dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<1x512xf16> - %38 = call @Unknown58(%alloc_80) : (memref<1x512xf16>) -> memref<1x512xf16> - %39 = call @Unknown59(%arg4) : (memref<1000x512xf32>) -> memref<1000x512xf16> - %alloc_81 = memref.alloc() : memref<512x1000xf16> - byre.compute @TransposeOp_f16_f16(%39, %alloc_81) {memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16>, memref<512x1000xf16> - %alloc_82 = memref.alloc() : memref<1x1000xf16> - byre.compute @MatmulOp_f16f16_f16(%38, %39, %alloc_82) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16>, memref<1000x512xf16>, memref<1x1000xf16> - %40 = call @Unknown60(%arg3, %alloc_82) : (memref<1000xf32>, memref<1x1000xf16>) -> memref<1x1000xf16> - %41 = call @Unknown61(%alloc_1, %arg63) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %42 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %43 = call @Unknown63(%alloc_6, %arg66) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %44 = call @Unknown64(%alloc_7, %arg67) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %45 = call @Unknown65(%alloc_10, %arg69) : (memref<64xf32>, memref<64xf32>) -> 
memref<64xf32> - %46 = call @Unknown66(%alloc_11, %arg70) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %47 = call @Unknown67(%alloc_14, %arg72) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %48 = call @Unknown68(%alloc_15, %arg73) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %49 = call @Unknown69(%alloc_18, %arg75) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %50 = call @Unknown70(%alloc_19, %arg76) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %51 = call @Unknown71(%alloc_26, %arg78) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %52 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %53 = call @Unknown73(%alloc_30, %arg81) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %54 = call @Unknown74(%alloc_31, %arg82) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %55 = call @Unknown75(%alloc_22, %arg84) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %56 = call @Unknown76(%alloc_23, %arg85) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %57 = call @Unknown77(%alloc_34, %arg87) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %58 = call @Unknown78(%alloc_35, %arg88) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %59 = call @Unknown79(%alloc_38, %arg90) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %60 = call @Unknown80(%alloc_39, %arg91) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %61 = call @Unknown81(%alloc_46, %arg93) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %62 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %63 = call @Unknown83(%alloc_50, %arg96) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %64 = call @Unknown84(%alloc_51, %arg97) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %65 = call @Unknown85(%alloc_42, %arg99) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %66 = call @Unknown86(%alloc_43, %arg100) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %67 = call @Unknown87(%alloc_54, %arg102) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %68 = call @Unknown88(%alloc_55, %arg103) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %69 = call @Unknown89(%alloc_58, %arg105) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %70 = call @Unknown90(%alloc_59, %arg106) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %71 = call @Unknown91(%alloc_66, %arg108) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %72 = call @Unknown92(%alloc_67, %arg109) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %73 = call @Unknown93(%alloc_70, %arg111) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %74 = call @Unknown94(%alloc_71, %arg112) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %75 = call @Unknown95(%alloc_62, %arg114) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %76 = call @Unknown96(%alloc_63, %arg115) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %77 = call @Unknown97(%alloc_74, %arg117) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %78 = call @Unknown98(%alloc_75, %arg118) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %79 = call @Unknown99(%alloc_78, %arg120) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %80 = call @Unknown100(%alloc_79, %arg121) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - return %40, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, 
%arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %38, %alloc_81 : memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, 
memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16> + %37 = call @Unknown51(%alloc_77, %33) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %38 = call @Unknown58(%37) : (memref<1x512x7x7xf16>) -> memref<1x512xf16> + %39 = call @Unknown59(%38) : (memref<1x512xf16>) -> memref<1x512xf16> + %40 = call @Unknown60(%arg4) : (memref<1000x512xf32>) -> memref<1000x512xf16> + %alloc_80 = memref.alloc() : memref<512x1000xf16> + byre.compute @TransposeOp_f16_f16(%40, %alloc_80) {memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16>, memref<512x1000xf16> + %alloc_81 = memref.alloc() : memref<1x1000xf16> + byre.compute @MatmulOp_f16f16_f16(%39, %40, %alloc_81) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16>, memref<1000x512xf16>, memref<1x1000xf16> + %41 = call @Unknown61(%arg3, %alloc_81) : (memref<1000xf32>, memref<1x1000xf16>) -> memref<1x1000xf16> + %42 = call @Unknown62(%alloc_1, %arg63) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %43 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %44 = call @Unknown62(%alloc_6, %arg66) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %45 = call @Unknown62(%alloc_7, %arg67) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %46 = call @Unknown62(%alloc_10, %arg69) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %47 = call @Unknown62(%alloc_11, %arg70) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %48 = call @Unknown62(%alloc_14, %arg72) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %49 = call @Unknown62(%alloc_15, %arg73) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %50 = call @Unknown62(%alloc_18, %arg75) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %51 = call @Unknown62(%alloc_19, %arg76) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %52 = call @Unknown72(%alloc_26, %arg78) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %53 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %54 = call @Unknown72(%alloc_30, %arg81) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %55 = call @Unknown72(%alloc_31, %arg82) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %56 = call @Unknown72(%alloc_22, %arg84) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %57 = call @Unknown72(%alloc_23, %arg85) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %58 = call @Unknown72(%alloc_34, %arg87) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %59 = call @Unknown72(%alloc_35, %arg88) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %60 = call @Unknown72(%alloc_38, %arg90) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %61 = call @Unknown72(%alloc_39, %arg91) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %62 = call @Unknown82(%alloc_46, %arg93) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %63 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %64 = call @Unknown82(%alloc_50, %arg96) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %65 = call @Unknown82(%alloc_51, %arg97) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %66 = 
call @Unknown82(%alloc_42, %arg99) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %67 = call @Unknown82(%alloc_43, %arg100) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %68 = call @Unknown82(%alloc_54, %arg102) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %69 = call @Unknown82(%alloc_55, %arg103) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %70 = call @Unknown82(%alloc_58, %arg105) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %71 = call @Unknown82(%alloc_59, %arg106) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %72 = call @Unknown92(%alloc_66, %arg108) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %73 = call @Unknown92(%alloc_67, %arg109) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %74 = call @Unknown92(%alloc_70, %arg111) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %75 = call @Unknown92(%alloc_71, %arg112) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %76 = call @Unknown92(%alloc_62, %arg114) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %77 = call @Unknown92(%alloc_63, %arg115) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %78 = call @Unknown92(%alloc_74, %arg117) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %79 = call @Unknown92(%alloc_75, %arg118) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %80 = call @Unknown92(%alloc_78, %arg120) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %81 = call @Unknown92(%alloc_79, %arg121) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + return %41, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %39, %alloc_80 : memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, 
memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/FW/7_set_space_opt.mlir b/compiler/test/E2E/ResNet18/FW/7_set_space_opt.mlir index 881ac0f3e..61f68bea1 100644 --- a/compiler/test/E2E/ResNet18/FW/7_set_space_opt.mlir +++ b/compiler/test/E2E/ResNet18/FW/7_set_space_opt.mlir @@ -1,3133 +1,998 @@ -// RUN: byteir-opt %s -remove-func-body="anchor-attr=__byteir_elementwise_fusion__" -set-op-space="entry-func=main space=cuda" -set-arg-space="entry-func=main all-space=cuda" | FileCheck %s +// RUN: byteir-opt %s -remove-func-body="anchor-attr=__byteir_elementwise_fusion__" -inline -gpu-launch-func-to-byre -set-op-space="entry-func=main space=cuda" -set-arg-space="entry-func=main all-space=cuda" | FileCheck %s // CHECK-LABEL: func.func @main module attributes {gpu.container_module} { gpu.module @unified { - gpu.func @Unknown100(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown99(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func 
@Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c512 step %6 { + %7 = memref.load %arg1[%arg3] : memref<512xf32> + %8 = memref.load %arg0[%arg3] : memref<512xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<512xf32> } gpu.return } - gpu.func @Unknown98(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index + gpu.func @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c256 step %6 { + %7 = memref.load %arg1[%arg3] : memref<256xf32> + %8 = memref.load %arg0[%arg3] : memref<256xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<256xf32> } gpu.return } - gpu.func @Unknown97(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index + gpu.func @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c128 step %6 { + %7 = memref.load %arg1[%arg3] : memref<128xf32> + %8 = memref.load %arg0[%arg3] : memref<128xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<128xf32> } gpu.return } - gpu.func @Unknown96(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: 
memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index + gpu.func @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c64 step %6 { + %7 = memref.load %arg1[%arg3] : memref<64xf32> + %8 = memref.load %arg0[%arg3] : memref<64xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<64xf32> } gpu.return } - gpu.func @Unknown95(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index + gpu.func @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>, %arg2: memref<1x1000xf16>) kernel { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg3] : memref<1000xf32> + %8 = memref.load %arg1[%c0, %arg3] : memref<1x1000xf16> + %9 = arith.truncf %7 : f32 to f16 + %10 = arith.addf %8, %9 : f16 + memref.store %10, %arg2[%c0, %arg3] : memref<1x1000xf16> } gpu.return } - gpu.func @Unknown94(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown60(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { + %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf16> } gpu.return } - gpu.func @Unknown93(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - 
%cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown59(%arg0: memref<1x512xf16>, %arg1: memref<1x512xf16>) kernel { + %cst = arith.constant 2.040100e-02 : f16 + %c0 = arith.constant 0 : index %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512 step %6 { + %7 = memref.load %arg0[%c0, %arg2] : memref<1x512xf16> + %8 = arith.mulf %7, %cst : f16 + memref.store %8, %arg1[%c0, %arg2] : memref<1x512xf16> } gpu.return } - gpu.func @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index + gpu.func @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } - gpu.func @Unknown91(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown49(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { + %c2359296 = arith.constant 2359296 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown90(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = 
arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown89(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown88(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown87(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown86(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : 
memref<512x512x3x3xf16> } gpu.return } - gpu.func @Unknown85(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index + gpu.func @Unknown48(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } - gpu.func @Unknown84(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown46(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf16> } gpu.return } - gpu.func @Unknown83(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown44(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { + %c131072 = arith.constant 131072 : index + %c0 = arith.constant 0 : index %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> + %5 = gpu.grid_dim x + %6 = arith.muli 
%1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> } gpu.return } - gpu.func @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index + gpu.func @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } - gpu.func @Unknown81(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown35(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { + %c589824 = arith.constant 589824 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown80(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown79(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = 
gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown78(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown77(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown76(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf16> } gpu.return } - gpu.func @Unknown75(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index + gpu.func @Unknown34(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = 
arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg2, %c14 : index + %8 = arith.divsi %arg2, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } - gpu.func @Unknown74(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown32(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { + %c294912 = arith.constant 294912 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf16> } gpu.return } - gpu.func @Unknown73(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown30(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { + %c32768 = arith.constant 32768 : index + %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> } gpu.return } - gpu.func @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index + gpu.func @Unknown23(%arg0: 
memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } - gpu.func @Unknown71(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown21(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { + %c147456 = arith.constant 147456 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown70(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown69(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + 
%9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf16> } gpu.return } - gpu.func @Unknown68(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index + gpu.func @Unknown20(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg2, %c28 : index + %8 = arith.divsi %arg2, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } - gpu.func @Unknown67(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown18(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { + %c73728 = arith.constant 73728 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf16> } gpu.return } - gpu.func @Unknown66(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown16(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { + %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 
= memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> } gpu.return } - gpu.func @Unknown65(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index + gpu.func @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } - gpu.func @Unknown64(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index + gpu.func @Unknown6(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg2, %c56 : index + %8 = arith.divsi %arg2, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } - gpu.func @Unknown63(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 
1.000000e-01 : f32 + gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { + %c36864 = arith.constant 36864 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf16> } gpu.return } - gpu.func @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index + gpu.func @Unknown3(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg2, %c112 : index + %8 = arith.divsi %arg2, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x64x112x112xf16> } gpu.return } - gpu.func @Unknown61(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index + gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { + %c9408 = arith.constant 9408 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : 
index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf16> } gpu.return } - gpu.func @Unknown60(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>, %arg2: memref<1x1000xf16>) kernel { + gpu.func @Unknown0(%arg0: memref<1x3x224x224xf32>, %arg1: memref<1x3x224x224xf16>) kernel { + %c150528 = arith.constant 150528 : index %c0 = arith.constant 0 : index - %c1000 = arith.constant 1000 : index + %c224 = arith.constant 224 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg1[%c0, %4] : memref<1x1000xf16> - %7 = memref.load %arg0[%4] : memref<1000xf32> - %8 = arith.truncf %7 : f32 to f16 - %9 = arith.addf %6, %8 : f16 - memref.store %9, %arg2[%c0, %4] : memref<1x1000xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c150528 step %6 { + %7 = arith.remsi %arg2, %c224 : index + %8 = arith.divsi %arg2, %c224 : index + %9 = arith.remsi %8, %c224 : index + %10 = arith.divsi %8, %c224 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x3x224x224xf32> + %12 = arith.truncf %11 : f32 to f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x3x224x224xf16> } gpu.return } - gpu.func @Unknown59(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { - %c0 = arith.constant 0 : index - %c512000 = arith.constant 512000 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown58_kernel(%arg0: memref<512x49xf16>, %arg1: memref<512xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf16> - } - gpu.return - } - gpu.func @Unknown58(%arg0: memref<1x512xf16>, %arg1: memref<1x512xf16>) kernel { - %cst = arith.constant 2.040100e-02 : f16 + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = gpu.thread_id x + %4 = gpu.thread_id y + %5 = gpu.thread_id z + %6 = gpu.grid_dim x + %7 = gpu.grid_dim y + %8 = gpu.grid_dim z + %9 = gpu.block_dim x + %10 = gpu.block_dim y + %11 = gpu.block_dim z + cf.br ^bb1 + ^bb1: // pred: ^bb0 + %c64 = arith.constant 64 : index %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%c0, %4] : memref<1x512xf16> - %7 = arith.mulf %6, %cst : f16 - memref.store %7, %arg1[%c0, %4] : memref<1x512xf16> - } - gpu.return - } - gpu.func @Unknown57(%arg0: memref<1x512x7x7xf16>, %arg1: 
memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c49 = arith.constant 49 : index + %c1 = arith.constant 1 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown55(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown54(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 
0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown52(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 
= arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown49(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown48(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = 
arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown46(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown44(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { - %c0 = arith.constant 0 : index - %c131072 = arith.constant 131072 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 
: index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - } - gpu.return - } - gpu.func @Unknown43(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown41(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : 
memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown40(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown38(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: 
memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown35(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown34(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : 
index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown32(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown30(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { - %c0 = arith.constant 0 : index - %c32768 = arith.constant 32768 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, 
%c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - } - gpu.return - } - gpu.func @Unknown29(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown27(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, 
%28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown26(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown24(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, 
%33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown21(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : 
f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown20(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown18(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown16(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { - %c0 = arith.constant 0 : index - %c8192 = arith.constant 8192 
: index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - } - gpu.return - } - gpu.func @Unknown15(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown13(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 
: index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown10(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, 
%28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown7(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : 
index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown6(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, 
%19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown3(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - } - gpu.return - } - gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf16> - } - gpu.return - } - gpu.func @Unknown0(%arg0: memref<1x3x224x224xf32>, %arg1: memref<1x3x224x224xf16>) kernel { - %c0 = arith.constant 0 : index - %c150528 = arith.constant 150528 : index - %c224 = arith.constant 224 : index - 
%c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c150528 : index - scf.if %5 { - %6 = arith.remsi %4, %c224 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c224 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c224 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c224 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c224 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c224 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x3x224x224xf32> - %27 = arith.truncf %26 : f32 to f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x3x224x224xf16> - } - gpu.return - } - } - func.func private @Unknown0(%arg0: memref<1x3x224x224xf32>) -> memref<1x3x224x224xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1176 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1176 = arith.constant 1176 : index - %alloc = memref.alloc() : memref<1x3x224x224xf16> - gpu.launch_func @unified::@Unknown0 blocks in (%c1176, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x3x224x224xf32>, %alloc : memref<1x3x224x224xf16>) - return %alloc : memref<1x3x224x224xf16> - } - func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 74 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c74 = arith.constant 74 : index - %alloc = memref.alloc() : memref<64x3x7x7xf16> - gpu.launch_func @unified::@Unknown1 blocks in (%c74, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x3x7x7xf32>, %alloc : memref<64x3x7x7xf16>) - return %alloc : memref<64x3x7x7xf16> - } - func.func private @Unknown3(%arg0: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown3", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index - %alloc = memref.alloc() : memref<1x64x112x112xf16> - gpu.launch_func @unified::@Unknown3 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x112x112xf16>, %alloc : memref<1x64x112x112xf16>) - return %alloc : memref<1x64x112x112xf16> - } - func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, 
__byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - gpu.launch_func @unified::@Unknown4 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown6", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown6 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown7(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown7", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - gpu.launch_func @unified::@Unknown7 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown9", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown9 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown10(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown10", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - gpu.launch_func @unified::@Unknown10 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) - return %alloc : 
memref<64x64x3x3xf16> - } - func.func private @Unknown12(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown12", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown12 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown13(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown13", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - gpu.launch_func @unified::@Unknown13 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown15(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown15", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown15 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown16(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 64 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown16", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<128x64x1x1xf16> - gpu.launch_func @unified::@Unknown16 blocks in (%c64, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x64x1x1xf32>, %alloc : memref<128x64x1x1xf16>) - return %alloc : memref<128x64x1x1xf16> - } - func.func private @Unknown18(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown18", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c576 = arith.constant 576 : index - %alloc = memref.alloc() : memref<128x64x3x3xf16> - 
gpu.launch_func @unified::@Unknown18 blocks in (%c576, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x64x3x3xf32>, %alloc : memref<128x64x3x3xf16>) - return %alloc : memref<128x64x3x3xf16> - } - func.func private @Unknown20(%arg0: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown20", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - gpu.launch_func @unified::@Unknown20 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown21(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown21", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - gpu.launch_func @unified::@Unknown21 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf32>, %alloc : memref<128x128x3x3xf16>) - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - gpu.launch_func @unified::@Unknown23 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown24(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown24", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - gpu.launch_func @unified::@Unknown24 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf32>, %alloc : memref<128x128x3x3xf16>) - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown26(%arg0: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown26", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], 
byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - gpu.launch_func @unified::@Unknown26 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown27(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown27", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - gpu.launch_func @unified::@Unknown27 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf32>, %alloc : memref<128x128x3x3xf16>) - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown29(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown29", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - gpu.launch_func @unified::@Unknown29 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown30(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 256 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown30", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x128x1x1xf16> - gpu.launch_func @unified::@Unknown30 blocks in (%c256, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x128x1x1xf32>, %alloc : memref<256x128x1x1xf16>) - return %alloc : memref<256x128x1x1xf16> - } - func.func private @Unknown32(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown32", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c2304 = arith.constant 2304 : index - %alloc = memref.alloc() : memref<256x128x3x3xf16> - gpu.launch_func @unified::@Unknown32 blocks in (%c2304, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x128x3x3xf32>, %alloc : memref<256x128x3x3xf16>) - return %alloc : memref<256x128x3x3xf16> - } - func.func private @Unknown34(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> 
attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown34", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c392 = arith.constant 392 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - gpu.launch_func @unified::@Unknown34 blocks in (%c392, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown35(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown35", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - gpu.launch_func @unified::@Unknown35 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf32>, %alloc : memref<256x256x3x3xf16>) - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown37", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c392 = arith.constant 392 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - gpu.launch_func @unified::@Unknown37 blocks in (%c392, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown38(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown38", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - gpu.launch_func @unified::@Unknown38 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf32>, %alloc : memref<256x256x3x3xf16>) - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown40(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown40", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c392 = arith.constant 392 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - gpu.launch_func @unified::@Unknown40 blocks in (%c392, %c1, %c1) threads in (%c128, 
%c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown41(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown41", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - gpu.launch_func @unified::@Unknown41 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf32>, %alloc : memref<256x256x3x3xf16>) - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown43(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown43", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c392 = arith.constant 392 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - gpu.launch_func @unified::@Unknown43 blocks in (%c392, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown44(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1024 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown44", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1024 = arith.constant 1024 : index - %alloc = memref.alloc() : memref<512x256x1x1xf16> - gpu.launch_func @unified::@Unknown44 blocks in (%c1024, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x256x1x1xf32>, %alloc : memref<512x256x1x1xf16>) - return %alloc : memref<512x256x1x1xf16> - } - func.func private @Unknown46(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 9216 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c9216 = arith.constant 9216 : index - %alloc = memref.alloc() : memref<512x256x3x3xf16> - gpu.launch_func @unified::@Unknown46 blocks in (%c9216, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x256x3x3xf32>, %alloc : memref<512x256x3x3xf16>) - return %alloc : memref<512x256x3x3xf16> - } - func.func private @Unknown48(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown48", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 
: index - %c1 = arith.constant 1 : index - %c196 = arith.constant 196 : index - %alloc = memref.alloc() : memref<1x512x7x7xf16> - gpu.launch_func @unified::@Unknown48 blocks in (%c196, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown49(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown49", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - gpu.launch_func @unified::@Unknown49 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf32>, %alloc : memref<512x512x3x3xf16>) - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown51", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c196 = arith.constant 196 : index - %alloc = memref.alloc() : memref<1x512x7x7xf16> - gpu.launch_func @unified::@Unknown51 blocks in (%c196, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %arg1 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown52(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown52", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - gpu.launch_func @unified::@Unknown52 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf32>, %alloc : memref<512x512x3x3xf16>) - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown54(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown54", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c196 = arith.constant 196 : index - %alloc = memref.alloc() : memref<1x512x7x7xf16> - gpu.launch_func @unified::@Unknown54 blocks in (%c196, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown55(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : 
i32], __byre__kernel_name = "Unknown55", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - gpu.launch_func @unified::@Unknown55 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf32>, %alloc : memref<512x512x3x3xf16>) - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown57(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c196 = arith.constant 196 : index - %alloc = memref.alloc() : memref<1x512x7x7xf16> - gpu.launch_func @unified::@Unknown57 blocks in (%c196, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %arg1 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown58(%arg0: memref<1x512xf16>) -> memref<1x512xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown58", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<1x512xf16> - gpu.launch_func @unified::@Unknown58 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512xf16>, %alloc : memref<1x512xf16>) - return %alloc : memref<1x512xf16> - } - func.func private @Unknown59(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4000 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown59", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4000 = arith.constant 4000 : index - %alloc = memref.alloc() : memref<1000x512xf16> - gpu.launch_func @unified::@Unknown59 blocks in (%c4000, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1000x512xf32>, %alloc : memref<1000x512xf16>) - return %alloc : memref<1000x512xf16> - } - func.func private @Unknown60(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown60", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index - %alloc = memref.alloc() : memref<1x1000xf16> - gpu.launch_func @unified::@Unknown60 blocks in (%c8, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1000xf32>, %arg1 : memref<1x1000xf16>, %alloc : memref<1x1000xf16>) - return %alloc : memref<1x1000xf16> - } - func.func private 
@Unknown61(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown61 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> - } - func.func private @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown62", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + %c32 = arith.constant 32 : index + %c2 = arith.constant 2 : index + %c16 = arith.constant 16 : index + %c8 = arith.constant 8 : index + %c4 = arith.constant 4 : index + %12 = gpu.block_id x + %subview = memref.subview %arg0[%12, 0] [1, 49] [1, 1] : memref<512x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + %13 = gpu.thread_id x + %14 = arith.remsi %13, %c64 : index + %15 = arith.cmpi slt, %14, %c0 : index + %16 = arith.addi %14, %c64 : index + %17 = arith.select %15, %16, %14 : index + %18 = arith.cmpi slt, %17, %c49 : index + %19 = arith.select %18, %17, %c49 : index + %20 = arith.addi %17, %c1 : index + %21 = arith.cmpi slt, %20, %c49 : index + %22 = arith.select %21, %20, %c49 : index + %23 = arith.subi %22, %19 : index + %subview_0 = memref.subview %expand_shape[0, %19] [1, %23] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %24 = arith.cmpi ugt, %23, %c0 : index + %25 = scf.if %24 -> (f16) { + %33 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %33 : f16 + } else { + scf.yield %cst : f16 + } + %26 = arith.addf %25, %cst : f16 + memref.store %26, %alloca[%13] : memref<64xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<32xf16, #gpu.address_space> + %27 = arith.cmpi ult, %13, %c32 : index + scf.if %27 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca[%33] : memref<64xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca[%36] : memref<64xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_2[%13] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<16xf16, #gpu.address_space> + %28 = arith.cmpi ult, %13, %c16 : index + scf.if %28 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_2[%33] : memref<32xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_2[%36] : memref<32xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + 
memref.store %38, %alloca_3[%13] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<8xf16, #gpu.address_space> + %29 = arith.cmpi ult, %13, %c8 : index + scf.if %29 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_3[%33] : memref<16xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_3[%36] : memref<16xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_4[%13] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<4xf16, #gpu.address_space> + %30 = arith.cmpi ult, %13, %c4 : index + scf.if %30 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_4[%33] : memref<8xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_4[%36] : memref<8xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_5[%13] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<2xf16, #gpu.address_space> + %31 = arith.cmpi ult, %13, %c2 : index + scf.if %31 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_5[%33] : memref<4xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_5[%36] : memref<4xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_6[%13] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %32 = arith.cmpi ult, %13, %c1 : index + scf.if %32 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_6[%33] : memref<2xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_6[%36] : memref<2xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %arg1[%12] : memref<512xf16> + } + gpu.barrier + gpu.return + } + } + func.func private @Unknown0(%arg0: memref<1x3x224x224xf32>) -> memref<1x3x224x224xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 147 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c147 = arith.constant 147 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown62 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x3x224x224xf16> + gpu.launch_func @unified::@Unknown0 blocks in (%c147, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x3x224x224xf32>, %alloc : memref<1x3x224x224xf16>) + return %alloc : memref<1x3x224x224xf16> } - func.func private @Unknown63(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown63", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 10 : 
i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c10 = arith.constant 10 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown63 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<64x3x7x7xf16> + gpu.launch_func @unified::@Unknown1 blocks in (%c10, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<64x3x7x7xf32>, %alloc : memref<64x3x7x7xf16>) + return %alloc : memref<64x3x7x7xf16> } - func.func private @Unknown64(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown64", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown3(%arg0: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown3", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c784 = arith.constant 784 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown64 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x64x112x112xf16> + gpu.launch_func @unified::@Unknown3 blocks in (%c784, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x64x112x112xf16>, %alloc : memref<1x64x112x112xf16>) + return %alloc : memref<1x64x112x112xf16> } - func.func private @Unknown65(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown65", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 36 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c36 = arith.constant 36 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown65 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<64x64x3x3xf16> + gpu.launch_func @unified::@Unknown4 blocks in (%c36, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) + return 
%alloc : memref<64x64x3x3xf16> } - func.func private @Unknown66(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown66", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown6(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown6", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown66 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x64x56x56xf16> + gpu.launch_func @unified::@Unknown6 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) + return %alloc : memref<1x64x56x56xf16> } - func.func private @Unknown67(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown67", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown9", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown67 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x64x56x56xf16> + gpu.launch_func @unified::@Unknown9 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) + return %alloc : memref<1x64x56x56xf16> } - func.func private @Unknown68(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown68", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown16(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = 
"Unknown16", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c8 = arith.constant 8 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown68 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<128x64x1x1xf16> + gpu.launch_func @unified::@Unknown16 blocks in (%c8, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x64x1x1xf32>, %alloc : memref<128x64x1x1xf16>) + return %alloc : memref<128x64x1x1xf16> } - func.func private @Unknown69(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown69", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown18(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 72 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown18", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c72 = arith.constant 72 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown69 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<128x64x3x3xf16> + gpu.launch_func @unified::@Unknown18 blocks in (%c72, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x64x3x3xf32>, %alloc : memref<128x64x3x3xf16>) + return %alloc : memref<128x64x3x3xf16> } - func.func private @Unknown70(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown70", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown20(%arg0: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown20", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown70 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x128x28x28xf16> + gpu.launch_func @unified::@Unknown20 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) + return %alloc : memref<1x128x28x28xf16> } - func.func private 
@Unknown71(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown71", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown21(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 144 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown21", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c144 = arith.constant 144 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown71 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<128x128x3x3xf16> + gpu.launch_func @unified::@Unknown21 blocks in (%c144, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x128x3x3xf32>, %alloc : memref<128x128x3x3xf16>) + return %alloc : memref<128x128x3x3xf16> } - func.func private @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown72", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown72 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x128x28x28xf16> + gpu.launch_func @unified::@Unknown23 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) + return %alloc : memref<1x128x28x28xf16> } - func.func private @Unknown73(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown73", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown30(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown30", 
__byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c32 = arith.constant 32 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown73 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<256x128x1x1xf16> + gpu.launch_func @unified::@Unknown30 blocks in (%c32, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x128x1x1xf32>, %alloc : memref<256x128x1x1xf16>) + return %alloc : memref<256x128x1x1xf16> } - func.func private @Unknown74(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown74", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown32(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown32", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c288 = arith.constant 288 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown74 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<256x128x3x3xf16> + gpu.launch_func @unified::@Unknown32 blocks in (%c288, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x128x3x3xf32>, %alloc : memref<256x128x3x3xf16>) + return %alloc : memref<256x128x3x3xf16> } - func.func private @Unknown75(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown75", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown34(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 49 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown34", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c49 = arith.constant 49 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown75 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x256x14x14xf16> + gpu.launch_func @unified::@Unknown34 blocks in (%c49, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) + return %alloc : memref<1x256x14x14xf16> } 
- func.func private @Unknown76(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown76", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown35(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown35", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c576 = arith.constant 576 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown76 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<256x256x3x3xf16> + gpu.launch_func @unified::@Unknown35 blocks in (%c576, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x256x3x3xf32>, %alloc : memref<256x256x3x3xf16>) + return %alloc : memref<256x256x3x3xf16> } - func.func private @Unknown77(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown77", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 49 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown37", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c49 = arith.constant 49 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown77 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x256x14x14xf16> + gpu.launch_func @unified::@Unknown37 blocks in (%c49, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) + return %alloc : memref<1x256x14x14xf16> } - func.func private @Unknown78(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown78", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + func.func private @Unknown44(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 128 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown44", 
__byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c128 = arith.constant 128 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown78 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<512x256x1x1xf16> + gpu.launch_func @unified::@Unknown44 blocks in (%c128, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x256x1x1xf32>, %alloc : memref<512x256x1x1xf16>) + return %alloc : memref<512x256x1x1xf16> } - func.func private @Unknown79(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown79", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown46(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c1152 = arith.constant 1152 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown79 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<512x256x3x3xf16> + gpu.launch_func @unified::@Unknown46 blocks in (%c1152, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x256x3x3xf32>, %alloc : memref<512x256x3x3xf16>) + return %alloc : memref<512x256x3x3xf16> } - func.func private @Unknown80(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown80", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown48(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown48", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c25 = arith.constant 25 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown80 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x512x7x7xf16> + gpu.launch_func @unified::@Unknown48 blocks in (%c25, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) + return %alloc : memref<1x512x7x7xf16> } - 
func.func private @Unknown81(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown81", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown49(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown49", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c2304 = arith.constant 2304 : index %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown81 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) - return %alloc : memref<256xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<512x512x3x3xf16> + gpu.launch_func @unified::@Unknown49 blocks in (%c2304, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x512x3x3xf32>, %alloc : memref<512x512x3x3xf16>) + return %alloc : memref<512x512x3x3xf16> } - func.func private @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown82", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown51", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c25 = arith.constant 25 : index %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown82 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) - return %alloc : memref<256xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x512x7x7xf16> + gpu.launch_func @unified::@Unknown51 blocks in (%c25, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %arg1 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) + return %alloc : memref<1x512x7x7xf16> } - func.func private @Unknown83(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown83", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index + func.func private @Unknown58(%arg0: memref<1x512x7x7xf16>) -> memref<1x512xf16> attributes {__byteir_reduction_fusion__} { + %c4 = arith.constant 
4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c512 = arith.constant 512 : index %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown83 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) - return %alloc : memref<256xf32> - } - func.func private @Unknown84(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown84", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown84 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) - return %alloc : memref<256xf32> + %c49 = arith.constant 49 : index + %c64 = arith.constant 64 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %collapse_shape = memref.collapse_shape %arg0 [[0, 1], [2, 3]] : memref<1x512x7x7xf16> into memref<512x49xf16> + %alloc = memref.alloc() : memref<512xf16> + gpu.launch_func @unified::@Unknown58_kernel blocks in (%c512, %c1, %c1) threads in (%c64, %c1, %c1) args(%collapse_shape : memref<512x49xf16>, %alloc : memref<512xf16>) + %expand_shape = memref.expand_shape %alloc [[0, 1]] : memref<512xf16> into memref<1x512xf16> + return %expand_shape : memref<1x512xf16> } - func.func private @Unknown85(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown85", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown59(%arg0: memref<1x512xf16>) -> memref<1x512xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown59", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown85 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) - return %alloc : memref<256xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x512xf16> + gpu.launch_func @unified::@Unknown59 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x512xf16>, %alloc : memref<1x512xf16>) + return %alloc : memref<1x512xf16> } - func.func private @Unknown86(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown86", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = 
arith.constant 128 : index + func.func private @Unknown60(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 500 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown60", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c500 = arith.constant 500 : index %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown86 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) - return %alloc : memref<256xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1000x512xf16> + gpu.launch_func @unified::@Unknown60 blocks in (%c500, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1000x512xf32>, %alloc : memref<1000x512xf16>) + return %alloc : memref<1000x512xf16> } - func.func private @Unknown87(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown87", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown87 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) - return %alloc : memref<256xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x1000xf16> + gpu.launch_func @unified::@Unknown61 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1000xf32>, %arg1 : memref<1x1000xf16>, %alloc : memref<1x1000xf16>) + return %alloc : memref<1x1000xf16> } - func.func private @Unknown88(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown88", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown62", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown88 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : 
memref<256xf32>) - return %alloc : memref<256xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<64xf32> + gpu.launch_func @unified::@Unknown62 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) + return %alloc : memref<64xf32> } - func.func private @Unknown89(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown89", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown72", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown89 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) - return %alloc : memref<256xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<128xf32> + gpu.launch_func @unified::@Unknown72 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) + return %alloc : memref<128xf32> } - func.func private @Unknown90(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown90", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown82", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown90 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) + gpu.launch_func @unified::@Unknown82 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) return %alloc : memref<256xf32> } - func.func private @Unknown91(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown91", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - 
%c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown91 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown92", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown92 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown93(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown93", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown93 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown94(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown94", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown94 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown95(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown95", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown95 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown96(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown96", 
__byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown96 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown97(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown97", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown97 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown98(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown98", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown98 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown99(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown99", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown99 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown100(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown100", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown92", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = 
arith.constant 1 : index - %c4 = arith.constant 4 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown100 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) + gpu.launch_func @unified::@Unknown92 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) return %alloc : memref<512xf32> } func.func @main(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64x3x7x7xf32>, %arg3: memref<1000xf32>, %arg4: memref<1000x512xf32>, %arg5: memref<64xf32>, %arg6: memref<64xf32>, %arg7: memref<64xf32>, %arg8: memref<64xf32>, %arg9: memref<64x64x3x3xf32>, %arg10: memref<64x64x3x3xf32>, %arg11: memref<64xf32>, %arg12: memref<64xf32>, %arg13: memref<64xf32>, %arg14: memref<64xf32>, %arg15: memref<64x64x3x3xf32>, %arg16: memref<64x64x3x3xf32>, %arg17: memref<128xf32>, %arg18: memref<128xf32>, %arg19: memref<128xf32>, %arg20: memref<128xf32>, %arg21: memref<128x64x3x3xf32>, %arg22: memref<128x128x3x3xf32>, %arg23: memref<128x64x1x1xf32>, %arg24: memref<128xf32>, %arg25: memref<128xf32>, %arg26: memref<128xf32>, %arg27: memref<128xf32>, %arg28: memref<128xf32>, %arg29: memref<128xf32>, %arg30: memref<128x128x3x3xf32>, %arg31: memref<128x128x3x3xf32>, %arg32: memref<256xf32>, %arg33: memref<256xf32>, %arg34: memref<256xf32>, %arg35: memref<256xf32>, %arg36: memref<256x128x3x3xf32>, %arg37: memref<256x256x3x3xf32>, %arg38: memref<256x128x1x1xf32>, %arg39: memref<256xf32>, %arg40: memref<256xf32>, %arg41: memref<256xf32>, %arg42: memref<256xf32>, %arg43: memref<256xf32>, %arg44: memref<256xf32>, %arg45: memref<256x256x3x3xf32>, %arg46: memref<256x256x3x3xf32>, %arg47: memref<512xf32>, %arg48: memref<512xf32>, %arg49: memref<512xf32>, %arg50: memref<512xf32>, %arg51: memref<512x256x3x3xf32>, %arg52: memref<512x512x3x3xf32>, %arg53: memref<512x256x1x1xf32>, %arg54: memref<512xf32>, %arg55: memref<512xf32>, %arg56: memref<512xf32>, %arg57: memref<512xf32>, %arg58: memref<512xf32>, %arg59: memref<512xf32>, %arg60: memref<512x512x3x3xf32>, %arg61: memref<512x512x3x3xf32>, %arg62: memref, %arg63: memref<64xf32>, %arg64: memref<64xf32>, %arg65: memref, %arg66: memref<64xf32>, %arg67: memref<64xf32>, %arg68: memref, %arg69: memref<64xf32>, %arg70: memref<64xf32>, %arg71: memref, %arg72: memref<64xf32>, %arg73: memref<64xf32>, %arg74: memref, %arg75: memref<64xf32>, %arg76: memref<64xf32>, %arg77: memref, %arg78: memref<128xf32>, %arg79: memref<128xf32>, %arg80: memref, %arg81: memref<128xf32>, %arg82: memref<128xf32>, %arg83: memref, %arg84: memref<128xf32>, %arg85: memref<128xf32>, %arg86: memref, %arg87: memref<128xf32>, %arg88: memref<128xf32>, %arg89: memref, %arg90: memref<128xf32>, %arg91: memref<128xf32>, %arg92: memref, %arg93: memref<256xf32>, %arg94: memref<256xf32>, %arg95: memref, %arg96: memref<256xf32>, %arg97: memref<256xf32>, %arg98: memref, %arg99: memref<256xf32>, %arg100: memref<256xf32>, %arg101: memref, %arg102: memref<256xf32>, %arg103: memref<256xf32>, %arg104: memref, %arg105: memref<256xf32>, %arg106: memref<256xf32>, %arg107: memref, %arg108: memref<512xf32>, %arg109: memref<512xf32>, %arg110: memref, %arg111: memref<512xf32>, %arg112: memref<512xf32>, %arg113: memref, %arg114: memref<512xf32>, %arg115: memref<512xf32>, %arg116: memref, %arg117: memref<512xf32>, %arg118: memref<512xf32>, %arg119: memref, %arg120: memref<512xf32>, %arg121: memref<512xf32>, 
%arg122: memref<1x3x224x224xf32>) -> (memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16>) attributes {__placeholder__byre.entry_point} { @@ -3150,7 +1015,7 @@ module attributes {gpu.container_module} { %alloc_7 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_4, %arg6, %arg5, %alloc_5, %alloc_6, %alloc_7) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> %4 = call @Unknown6(%alloc_5) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %5 = call @Unknown7(%arg10) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %5 = 
call @Unknown4(%arg10) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_8 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%4, %5, %alloc_8) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_9 = memref.alloc() : memref<1x64x56x56xf16> @@ -3158,22 +1023,22 @@ module attributes {gpu.container_module} { %alloc_11 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_8, %arg8, %arg7, %alloc_9, %alloc_10, %alloc_11) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> %6 = call @Unknown9(%alloc_9, %alloc_3) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %7 = call @Unknown10(%arg15) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %7 = call @Unknown4(%arg15) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_12 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%6, %7, %alloc_12) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_13 = memref.alloc() : memref<1x64x56x56xf16> %alloc_14 = memref.alloc() : memref<64xf32> %alloc_15 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_12, %arg12, %arg11, %alloc_13, %alloc_14, %alloc_15) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %8 = call @Unknown12(%alloc_13) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %9 = call @Unknown13(%arg16) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %8 = call @Unknown6(%alloc_13) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %9 = call @Unknown4(%arg16) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_16 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%8, %9, %alloc_16) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_17 = memref.alloc() : memref<1x64x56x56xf16> %alloc_18 = memref.alloc() : memref<64xf32> %alloc_19 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_16, %arg14, %arg13, %alloc_17, %alloc_18, %alloc_19) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : 
i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %10 = call @Unknown15(%alloc_17, %6) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %10 = call @Unknown9(%alloc_17, %6) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %11 = call @Unknown16(%arg23) : (memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> %alloc_20 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%10, %11, %alloc_20) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16> @@ -3197,22 +1062,22 @@ module attributes {gpu.container_module} { %alloc_31 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_28, %arg20, %arg19, %alloc_29, %alloc_30, %alloc_31) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> %15 = call @Unknown23(%alloc_29, %alloc_21) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> - %16 = call @Unknown24(%arg30) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %16 = call @Unknown21(%arg30) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %alloc_32 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%15, %16, %alloc_32) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_33 = memref.alloc() : memref<1x128x28x28xf16> %alloc_34 = memref.alloc() : memref<128xf32> %alloc_35 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_32, %arg27, %arg26, %alloc_33, %alloc_34, %alloc_35) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %17 = call @Unknown26(%alloc_33) : (memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> - %18 = call @Unknown27(%arg31) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %17 = call @Unknown20(%alloc_33) : (memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %18 = call @Unknown21(%arg31) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %alloc_36 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%17, %18, %alloc_36) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, 
rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_37 = memref.alloc() : memref<1x128x28x28xf16> %alloc_38 = memref.alloc() : memref<128xf32> %alloc_39 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_36, %arg29, %arg28, %alloc_37, %alloc_38, %alloc_39) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %19 = call @Unknown29(%alloc_37, %15) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %19 = call @Unknown23(%alloc_37, %15) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %20 = call @Unknown30(%arg38) : (memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%19, %20, %alloc_40) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16> @@ -3236,22 +1101,22 @@ module attributes {gpu.container_module} { %alloc_51 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_48, %arg35, %arg34, %alloc_49, %alloc_50, %alloc_51) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> %24 = call @Unknown37(%alloc_49, %alloc_41) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> - %25 = call @Unknown38(%arg45) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %25 = call @Unknown35(%arg45) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %alloc_52 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%24, %25, %alloc_52) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_53 = memref.alloc() : memref<1x256x14x14xf16> %alloc_54 = memref.alloc() : memref<256xf32> %alloc_55 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_52, %arg42, %arg41, %alloc_53, %alloc_54, %alloc_55) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %26 = call @Unknown40(%alloc_53) : (memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> - %27 = call @Unknown41(%arg46) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %26 = call @Unknown34(%alloc_53) : 
(memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %27 = call @Unknown35(%arg46) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %alloc_56 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%26, %27, %alloc_56) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_57 = memref.alloc() : memref<1x256x14x14xf16> %alloc_58 = memref.alloc() : memref<256xf32> %alloc_59 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_56, %arg44, %arg43, %alloc_57, %alloc_58, %alloc_59) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %28 = call @Unknown43(%alloc_57, %24) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %28 = call @Unknown37(%alloc_57, %24) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %29 = call @Unknown44(%arg53) : (memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> %alloc_60 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%28, %29, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16> @@ -3275,71 +1140,70 @@ module attributes {gpu.container_module} { %alloc_71 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_68, %arg50, %arg49, %alloc_69, %alloc_70, %alloc_71) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> %33 = call @Unknown51(%alloc_69, %alloc_61) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %34 = call @Unknown52(%arg60) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %34 = call @Unknown49(%arg60) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %alloc_72 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%33, %34, %alloc_72) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_73 = memref.alloc() : memref<1x512x7x7xf16> %alloc_74 = memref.alloc() : memref<512xf32> %alloc_75 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_72, %arg57, %arg56, %alloc_73, %alloc_74, %alloc_75) 
{epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %35 = call @Unknown54(%alloc_73) : (memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %36 = call @Unknown55(%arg61) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %35 = call @Unknown48(%alloc_73) : (memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %36 = call @Unknown49(%arg61) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %alloc_76 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%35, %36, %alloc_76) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_77 = memref.alloc() : memref<1x512x7x7xf16> %alloc_78 = memref.alloc() : memref<512xf32> %alloc_79 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_76, %arg59, %arg58, %alloc_77, %alloc_78, %alloc_79) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %37 = call @Unknown57(%alloc_77, %33) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %alloc_80 = memref.alloc() : memref<1x512xf16> - byre.compute @ReduceSumOp_f16_f16(%37, %alloc_80) {dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<1x512xf16> - %38 = call @Unknown58(%alloc_80) : (memref<1x512xf16>) -> memref<1x512xf16> - %39 = call @Unknown59(%arg4) : (memref<1000x512xf32>) -> memref<1000x512xf16> - %alloc_81 = memref.alloc() : memref<512x1000xf16> - byre.compute @TransposeOp_f16_f16(%39, %alloc_81) {memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16>, memref<512x1000xf16> - %alloc_82 = memref.alloc() : memref<1x1000xf16> - byre.compute @MatmulOp_f16f16_f16(%38, %39, %alloc_82) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16>, memref<1000x512xf16>, memref<1x1000xf16> - %40 = call @Unknown60(%arg3, %alloc_82) : (memref<1000xf32>, memref<1x1000xf16>) -> memref<1x1000xf16> - %41 = call @Unknown61(%alloc_1, %arg63) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %42 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %43 = call @Unknown63(%alloc_6, %arg66) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %44 = call @Unknown64(%alloc_7, %arg67) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %45 = call @Unknown65(%alloc_10, %arg69) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %46 = call @Unknown66(%alloc_11, %arg70) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %47 = call @Unknown67(%alloc_14, %arg72) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %48 = call @Unknown68(%alloc_15, %arg73) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %49 = 
call @Unknown69(%alloc_18, %arg75) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32>
-    %50 = call @Unknown70(%alloc_19, %arg76) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32>
-    %51 = call @Unknown71(%alloc_26, %arg78) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %52 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %53 = call @Unknown73(%alloc_30, %arg81) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %54 = call @Unknown74(%alloc_31, %arg82) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %55 = call @Unknown75(%alloc_22, %arg84) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %56 = call @Unknown76(%alloc_23, %arg85) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %57 = call @Unknown77(%alloc_34, %arg87) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %58 = call @Unknown78(%alloc_35, %arg88) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %59 = call @Unknown79(%alloc_38, %arg90) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %60 = call @Unknown80(%alloc_39, %arg91) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %61 = call @Unknown81(%alloc_46, %arg93) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %62 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %63 = call @Unknown83(%alloc_50, %arg96) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %64 = call @Unknown84(%alloc_51, %arg97) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %65 = call @Unknown85(%alloc_42, %arg99) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %66 = call @Unknown86(%alloc_43, %arg100) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %67 = call @Unknown87(%alloc_54, %arg102) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %68 = call @Unknown88(%alloc_55, %arg103) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %69 = call @Unknown89(%alloc_58, %arg105) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %70 = call @Unknown90(%alloc_59, %arg106) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %71 = call @Unknown91(%alloc_66, %arg108) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %72 = call @Unknown92(%alloc_67, %arg109) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %73 = call @Unknown93(%alloc_70, %arg111) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %74 = call @Unknown94(%alloc_71, %arg112) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %75 = call @Unknown95(%alloc_62, %arg114) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %76 = call @Unknown96(%alloc_63, %arg115) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %77 = call @Unknown97(%alloc_74, %arg117) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %78 = call @Unknown98(%alloc_75, %arg118) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %79 = call @Unknown99(%alloc_78, %arg120) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %80 = call @Unknown100(%alloc_79, %arg121) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    return %40, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51,
%52, %53, %54, %55, %56, %57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %38, %alloc_81 : memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16> + %37 = call @Unknown51(%alloc_77, %33) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %38 = call @Unknown58(%37) : 
(memref<1x512x7x7xf16>) -> memref<1x512xf16> + %39 = call @Unknown59(%38) : (memref<1x512xf16>) -> memref<1x512xf16> + %40 = call @Unknown60(%arg4) : (memref<1000x512xf32>) -> memref<1000x512xf16> + %alloc_80 = memref.alloc() : memref<512x1000xf16> + byre.compute @TransposeOp_f16_f16(%40, %alloc_80) {memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16>, memref<512x1000xf16> + %alloc_81 = memref.alloc() : memref<1x1000xf16> + byre.compute @MatmulOp_f16f16_f16(%39, %40, %alloc_81) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16>, memref<1000x512xf16>, memref<1x1000xf16> + %41 = call @Unknown61(%arg3, %alloc_81) : (memref<1000xf32>, memref<1x1000xf16>) -> memref<1x1000xf16> + %42 = call @Unknown62(%alloc_1, %arg63) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %43 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %44 = call @Unknown62(%alloc_6, %arg66) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %45 = call @Unknown62(%alloc_7, %arg67) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %46 = call @Unknown62(%alloc_10, %arg69) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %47 = call @Unknown62(%alloc_11, %arg70) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %48 = call @Unknown62(%alloc_14, %arg72) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %49 = call @Unknown62(%alloc_15, %arg73) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %50 = call @Unknown62(%alloc_18, %arg75) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %51 = call @Unknown62(%alloc_19, %arg76) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %52 = call @Unknown72(%alloc_26, %arg78) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %53 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %54 = call @Unknown72(%alloc_30, %arg81) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %55 = call @Unknown72(%alloc_31, %arg82) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %56 = call @Unknown72(%alloc_22, %arg84) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %57 = call @Unknown72(%alloc_23, %arg85) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %58 = call @Unknown72(%alloc_34, %arg87) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %59 = call @Unknown72(%alloc_35, %arg88) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %60 = call @Unknown72(%alloc_38, %arg90) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %61 = call @Unknown72(%alloc_39, %arg91) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %62 = call @Unknown82(%alloc_46, %arg93) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %63 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %64 = call @Unknown82(%alloc_50, %arg96) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %65 = call @Unknown82(%alloc_51, %arg97) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %66 = call @Unknown82(%alloc_42, %arg99) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %67 = call @Unknown82(%alloc_43, %arg100) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %68 = call @Unknown82(%alloc_54, %arg102) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %69 = call @Unknown82(%alloc_55, 
%arg103) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %70 = call @Unknown82(%alloc_58, %arg105) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %71 = call @Unknown82(%alloc_59, %arg106) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %72 = call @Unknown92(%alloc_66, %arg108) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %73 = call @Unknown92(%alloc_67, %arg109) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %74 = call @Unknown92(%alloc_70, %arg111) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %75 = call @Unknown92(%alloc_71, %arg112) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %76 = call @Unknown92(%alloc_62, %arg114) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %77 = call @Unknown92(%alloc_63, %arg115) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %78 = call @Unknown92(%alloc_74, %arg117) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %79 = call @Unknown92(%alloc_75, %arg118) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %80 = call @Unknown92(%alloc_78, %arg120) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %81 = call @Unknown92(%alloc_79, %arg121) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + return %41, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %39, %alloc_80 : memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, 
memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/FW/8_byre_opt.mlir b/compiler/test/E2E/ResNet18/FW/8_byre_opt.mlir index 5947f8979..a109ab380 100644 --- a/compiler/test/E2E/ResNet18/FW/8_byre_opt.mlir +++ b/compiler/test/E2E/ResNet18/FW/8_byre_opt.mlir @@ -4,2585 +4,775 @@ module attributes {gpu.container_module} { gpu.module @unified { - gpu.func @Unknown100(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown99(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown98(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : 
index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown97(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown96(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown95(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown94(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown93(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load 
%arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown91(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown90(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown89(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown88(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : 
memref<256xf32> - } - gpu.return - } - gpu.func @Unknown87(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown86(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown85(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown84(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown83(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = 
arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown81(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown80(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown79(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown78(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown77(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x 
- %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown76(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown75(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown74(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown73(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = 
memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown71(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown70(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown69(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown68(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown67(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - 
memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown66(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown65(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown64(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown63(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown61(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : 
f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index + gpu.func @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c512 step %6 { + %7 = memref.load %arg1[%arg3] : memref<512xf32> + %8 = memref.load %arg0[%arg3] : memref<512xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<512xf32> } gpu.return } - gpu.func @Unknown60(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>, %arg2: memref<1x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c1000 = arith.constant 1000 : index + gpu.func @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg1[%c0, %4] : memref<1x1000xf16> - %7 = memref.load %arg0[%4] : memref<1000xf32> - %8 = arith.truncf %7 : f32 to f16 - %9 = arith.addf %6, %8 : f16 - memref.store %9, %arg2[%c0, %4] : memref<1x1000xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c256 step %6 { + %7 = memref.load %arg1[%arg3] : memref<256xf32> + %8 = memref.load %arg0[%arg3] : memref<256xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<256xf32> } gpu.return } - gpu.func @Unknown59(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { - %c0 = arith.constant 0 : index - %c512000 = arith.constant 512000 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c128 step %6 { + 
%7 = memref.load %arg1[%arg3] : memref<128xf32> + %8 = memref.load %arg0[%arg3] : memref<128xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<128xf32> } gpu.return } - gpu.func @Unknown58(%arg0: memref<1x512xf16>, %arg1: memref<1x512xf16>) kernel { - %cst = arith.constant 2.040100e-02 : f16 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index + gpu.func @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%c0, %4] : memref<1x512xf16> - %7 = arith.mulf %6, %cst : f16 - memref.store %7, %arg1[%c0, %4] : memref<1x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c64 step %6 { + %7 = memref.load %arg1[%arg3] : memref<64xf32> + %8 = memref.load %arg0[%arg3] : memref<64xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<64xf32> } gpu.return } - gpu.func @Unknown57(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 + gpu.func @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>, %arg2: memref<1x1000xf16>) kernel { %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg3] : memref<1000xf32> + %8 = memref.load %arg1[%c0, %arg3] : memref<1x1000xf16> + %9 = arith.truncf %7 : f32 to f16 + %10 = arith.addf %8, %9 : f16 + memref.store %10, %arg2[%c0, %arg3] : memref<1x1000xf16> } gpu.return } - gpu.func @Unknown55(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = 
arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown60(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { + %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown54(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7] : 
memref<1000x512xf16> } gpu.return } - gpu.func @Unknown52(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { + gpu.func @Unknown59(%arg0: memref<1x512xf16>, %arg1: memref<1x512xf16>) kernel { + %cst = arith.constant 2.040100e-02 : f16 %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512 step %6 { + %7 = memref.load %arg0[%c0, %arg2] : memref<1x512xf16> + %8 = arith.mulf %7, %cst : f16 + memref.store %8, %arg1[%c0, %arg2] : memref<1x512xf16> } gpu.return } gpu.func @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load 
%arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown49(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf16> } gpu.return } gpu.func @Unknown48(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : 
index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown46(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = 
arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf16> } gpu.return } gpu.func @Unknown44(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - } - gpu.return - } - gpu.func @Unknown43(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown41(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, 
%4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown40(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown38(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = 
arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> } gpu.return } gpu.func @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown35(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = 
arith.constant 0 : index %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf16> } gpu.return } gpu.func @Unknown34(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = 
memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg2, %c14 : index + %8 = arith.divsi %arg2, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown32(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c294912 = arith.constant 294912 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf16> } gpu.return } gpu.func @Unknown30(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - 
%10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - } - gpu.return - } - gpu.func @Unknown29(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown27(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : 
index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown26(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown24(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, 
%arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown21(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 
= arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf16> } gpu.return } gpu.func @Unknown20(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg2, %c28 : index + %8 = arith.divsi %arg2, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown18(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = 
arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf16> } gpu.return } gpu.func @Unknown16(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - } - gpu.return - } - gpu.func @Unknown15(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, 
%c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown13(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select 
%7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown10(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> } gpu.return } gpu.func @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 
= arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown7(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown6(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : 
index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg2, %c56 : index + %8 = arith.divsi %arg2, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, 
%arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf16> } gpu.return } gpu.func @Unknown3(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg2, %c112 : index + %8 = arith.divsi %arg2, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x64x112x112xf16> } gpu.return } gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { - %c0 = arith.constant 0 : index %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, 
%15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf16> } gpu.return } gpu.func @Unknown0(%arg0: memref<1x3x224x224xf32>, %arg1: memref<1x3x224x224xf16>) kernel { - %c0 = arith.constant 0 : index %c150528 = arith.constant 150528 : index + %c0 = arith.constant 0 : index %c224 = arith.constant 224 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c150528 : index - scf.if %5 { - %6 = arith.remsi %4, %c224 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c224 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c224 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c224 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c224 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c224 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x3x224x224xf32> - %27 = arith.truncf %26 : f32 to f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x3x224x224xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c150528 step %6 { + %7 = arith.remsi %arg2, %c224 : index + %8 = arith.divsi %arg2, %c224 : index + %9 = arith.remsi %8, %c224 : index + %10 = arith.divsi %8, %c224 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x3x224x224xf32> + %12 = arith.truncf %11 : f32 to f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x3x224x224xf16> } gpu.return } + gpu.func @Unknown58_kernel(%arg0: memref<512x49xf16>, %arg1: memref<512xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c49 = arith.constant 49 : index + 
%c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 49] [1, 1] : memref<512x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c64 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c64 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c49 : index + %7 = arith.select %6, %5, %c49 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c49 : index + %10 = arith.select %9, %8, %c49 : index + %11 = arith.subi %10, %7 : index + %subview_0 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13 = scf.if %12 -> (f16) { + %21 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %14 = arith.addf %13, %cst : f16 + memref.store %14, %alloca[%1] : memref<64xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<32xf16, #gpu.address_space> + %15 = arith.cmpi ult, %1, %c32 : index + scf.if %15 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca[%21] : memref<64xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca[%24] : memref<64xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_2[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<16xf16, #gpu.address_space> + %16 = arith.cmpi ult, %1, %c16 : index + scf.if %16 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_2[%21] : memref<32xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_2[%24] : memref<32xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_3[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<8xf16, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c8 : index + scf.if %17 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_3[%21] : memref<16xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_3[%24] : memref<16xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_4[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<4xf16, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c4 : index + scf.if %18 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_4[%21] : memref<8xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_4[%24] : memref<8xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_5[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<2xf16, #gpu.address_space> + %19 = arith.cmpi ult, %1, %c2 : index + scf.if %19 { + %21 = arith.muli %1, %c2 : index + %22 = 
memref.load %alloca_5[%21] : memref<4xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_5[%24] : memref<4xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_6[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %20 = arith.cmpi ult, %1, %c1 : index + scf.if %20 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_6[%21] : memref<2xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_6[%24] : memref<2xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %arg1[%0] : memref<512xf16> + } + gpu.barrier + gpu.return + } } - func.func private @Unknown0(memref<1x3x224x224xf32, "cuda">) -> memref<1x3x224x224xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1176 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown1(memref<64x3x7x7xf32, "cuda">) -> memref<64x3x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 74 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown3(memref<1x64x112x112xf16, "cuda">) -> memref<1x64x112x112xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown3", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown4(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown6(memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown6", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown7(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown7", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown9(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown9", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown10(memref<64x64x3x3xf32, "cuda">) -> 
memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown10", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown12(memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown12", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown13(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown13", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown15(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown15", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown16(memref<128x64x1x1xf32, "cuda">) -> memref<128x64x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 64 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown16", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown18(memref<128x64x3x3xf32, "cuda">) -> memref<128x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown18", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown20(memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown20", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown21(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown21", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown23(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private 
@Unknown24(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown24", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown26(memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown26", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown27(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown27", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown29(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown29", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown30(memref<256x128x1x1xf32, "cuda">) -> memref<256x128x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 256 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown30", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown32(memref<256x128x3x3xf32, "cuda">) -> memref<256x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown32", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown34(memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown34", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown35(memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown35", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown37(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown37", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, 
device = "cuda"} - func.func private @Unknown38(memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown38", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown40(memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown40", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown41(memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown41", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown43(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown43", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown44(memref<512x256x1x1xf32, "cuda">) -> memref<512x256x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1024 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown44", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown46(memref<512x256x3x3xf32, "cuda">) -> memref<512x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 9216 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown48(memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown48", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown49(memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown49", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown51(memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown51", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = 
"PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown52(memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown52", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown54(memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown54", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown55(memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown55", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown57(memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown58(memref<1x512xf16, "cuda">) -> memref<1x512xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown58", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown59(memref<1000x512xf32, "cuda">) -> memref<1000x512xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4000 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown59", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown60(memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">) -> memref<1x1000xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown60", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown61(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown62(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown62", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 
2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown63(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown63", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown64(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown64", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown65(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown65", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown66(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown66", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown67(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown67", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown68(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown68", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown69(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown69", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown70(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown70", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown71(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks 
= [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown71", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown72(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown72", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown73(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown73", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown74(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown74", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown75(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown75", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown76(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown76", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown77(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown77", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown78(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown78", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown79(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown79", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown80(memref<128xf32, 
"cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown80", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown81(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown81", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown82(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown82", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown83(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown83", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown84(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown84", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown85(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown85", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown86(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown86", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown87(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown87", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown88(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown88", __byteir_elementwise_fusion__, 
arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown89(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown89", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown90(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown90", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown91(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown91", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown92(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown92", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown93(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown93", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown94(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown94", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown95(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown95", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown96(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown96", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown97(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 
128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown97", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown98(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown98", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown99(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown99", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown100(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown100", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown0(memref<1x3x224x224xf32, "cuda">) -> memref<1x3x224x224xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 147 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown1(memref<64x3x7x7xf32, "cuda">) -> memref<64x3x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 10 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown3(memref<1x64x112x112xf16, "cuda">) -> memref<1x64x112x112xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown3", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown4(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 36 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown6(memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown6", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown9(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> 
attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown9", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown16(memref<128x64x1x1xf32, "cuda">) -> memref<128x64x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown16", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown18(memref<128x64x3x3xf32, "cuda">) -> memref<128x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 72 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown18", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown20(memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown20", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown21(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 144 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown21", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown23(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown30(memref<256x128x1x1xf32, "cuda">) -> memref<256x128x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown30", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown32(memref<256x128x3x3xf32, "cuda">) -> memref<256x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown32", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown34(memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 49 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown34", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown35(memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> attributes 
{__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown35", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown37(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 49 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown37", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown44(memref<512x256x1x1xf32, "cuda">) -> memref<512x256x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 128 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown44", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown46(memref<512x256x3x3xf32, "cuda">) -> memref<512x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown48(memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown48", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown49(memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown49", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown51(memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown51", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown59(memref<1x512xf16, "cuda">) -> memref<1x512xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown59", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown60(memref<1000x512xf32, "cuda">) -> memref<1000x512xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 500 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown60", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown61(memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">) -> memref<1x1000xf16, 
"cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown62(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown62", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown72(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown72", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown82(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown82", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown92(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown92", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} func.func @main(%arg0: memref<64xf32, "cuda">, %arg1: memref<64xf32, "cuda">, %arg2: memref<64x3x7x7xf32, "cuda">, %arg3: memref<1000xf32, "cuda">, %arg4: memref<1000x512xf32, "cuda">, %arg5: memref<64xf32, "cuda">, %arg6: memref<64xf32, "cuda">, %arg7: memref<64xf32, "cuda">, %arg8: memref<64xf32, "cuda">, %arg9: memref<64x64x3x3xf32, "cuda">, %arg10: memref<64x64x3x3xf32, "cuda">, %arg11: memref<64xf32, "cuda">, %arg12: memref<64xf32, "cuda">, %arg13: memref<64xf32, "cuda">, %arg14: memref<64xf32, "cuda">, %arg15: memref<64x64x3x3xf32, "cuda">, %arg16: memref<64x64x3x3xf32, "cuda">, %arg17: memref<128xf32, "cuda">, %arg18: memref<128xf32, "cuda">, %arg19: memref<128xf32, "cuda">, %arg20: memref<128xf32, "cuda">, %arg21: memref<128x64x3x3xf32, "cuda">, %arg22: memref<128x128x3x3xf32, "cuda">, %arg23: memref<128x64x1x1xf32, "cuda">, %arg24: memref<128xf32, "cuda">, %arg25: memref<128xf32, "cuda">, %arg26: memref<128xf32, "cuda">, %arg27: memref<128xf32, "cuda">, %arg28: memref<128xf32, "cuda">, %arg29: memref<128xf32, "cuda">, %arg30: memref<128x128x3x3xf32, "cuda">, %arg31: memref<128x128x3x3xf32, "cuda">, %arg32: memref<256xf32, "cuda">, %arg33: memref<256xf32, "cuda">, %arg34: memref<256xf32, "cuda">, %arg35: memref<256xf32, "cuda">, %arg36: memref<256x128x3x3xf32, "cuda">, %arg37: memref<256x256x3x3xf32, "cuda">, %arg38: memref<256x128x1x1xf32, "cuda">, %arg39: memref<256xf32, "cuda">, %arg40: memref<256xf32, "cuda">, %arg41: memref<256xf32, "cuda">, %arg42: memref<256xf32, "cuda">, %arg43: memref<256xf32, "cuda">, %arg44: memref<256xf32, "cuda">, %arg45: 
memref<256x256x3x3xf32, "cuda">, %arg46: memref<256x256x3x3xf32, "cuda">, %arg47: memref<512xf32, "cuda">, %arg48: memref<512xf32, "cuda">, %arg49: memref<512xf32, "cuda">, %arg50: memref<512xf32, "cuda">, %arg51: memref<512x256x3x3xf32, "cuda">, %arg52: memref<512x512x3x3xf32, "cuda">, %arg53: memref<512x256x1x1xf32, "cuda">, %arg54: memref<512xf32, "cuda">, %arg55: memref<512xf32, "cuda">, %arg56: memref<512xf32, "cuda">, %arg57: memref<512xf32, "cuda">, %arg58: memref<512xf32, "cuda">, %arg59: memref<512xf32, "cuda">, %arg60: memref<512x512x3x3xf32, "cuda">, %arg61: memref<512x512x3x3xf32, "cuda">, %arg62: memref, %arg63: memref<64xf32, "cuda">, %arg64: memref<64xf32, "cuda">, %arg65: memref, %arg66: memref<64xf32, "cuda">, %arg67: memref<64xf32, "cuda">, %arg68: memref, %arg69: memref<64xf32, "cuda">, %arg70: memref<64xf32, "cuda">, %arg71: memref, %arg72: memref<64xf32, "cuda">, %arg73: memref<64xf32, "cuda">, %arg74: memref, %arg75: memref<64xf32, "cuda">, %arg76: memref<64xf32, "cuda">, %arg77: memref, %arg78: memref<128xf32, "cuda">, %arg79: memref<128xf32, "cuda">, %arg80: memref, %arg81: memref<128xf32, "cuda">, %arg82: memref<128xf32, "cuda">, %arg83: memref, %arg84: memref<128xf32, "cuda">, %arg85: memref<128xf32, "cuda">, %arg86: memref, %arg87: memref<128xf32, "cuda">, %arg88: memref<128xf32, "cuda">, %arg89: memref, %arg90: memref<128xf32, "cuda">, %arg91: memref<128xf32, "cuda">, %arg92: memref, %arg93: memref<256xf32, "cuda">, %arg94: memref<256xf32, "cuda">, %arg95: memref, %arg96: memref<256xf32, "cuda">, %arg97: memref<256xf32, "cuda">, %arg98: memref, %arg99: memref<256xf32, "cuda">, %arg100: memref<256xf32, "cuda">, %arg101: memref, %arg102: memref<256xf32, "cuda">, %arg103: memref<256xf32, "cuda">, %arg104: memref, %arg105: memref<256xf32, "cuda">, %arg106: memref<256xf32, "cuda">, %arg107: memref, %arg108: memref<512xf32, "cuda">, %arg109: memref<512xf32, "cuda">, %arg110: memref, %arg111: memref<512xf32, "cuda">, %arg112: memref<512xf32, "cuda">, %arg113: memref, %arg114: memref<512xf32, "cuda">, %arg115: memref<512xf32, "cuda">, %arg116: memref, %arg117: memref<512xf32, "cuda">, %arg118: memref<512xf32, "cuda">, %arg119: memref, %arg120: memref<512xf32, "cuda">, %arg121: memref<512xf32, "cuda">, %arg122: memref<1x3x224x224xf32, "cuda">) -> (memref<1x1000xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, 
memref<64xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<64x3x7x7xf16, "cuda">, memref<1x3x224x224xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512xf16, "cuda">, memref<512x1000xf16, "cuda">) attributes {__placeholder__byre.entry_point} { %0 = call @Unknown0(%arg122) : (memref<1x3x224x224xf32, "cuda">) -> memref<1x3x224x224xf16, "cuda"> %1 = call @Unknown1(%arg2) : (memref<64x3x7x7xf32, "cuda">) -> memref<64x3x7x7xf16, "cuda"> @@ -2603,7 +793,7 @@ module attributes {gpu.container_module} { %alloc_7 = memref.alloc() : memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_4, %arg6, %arg5, %alloc_5, %alloc_6, %alloc_7) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> %4 = call @Unknown6(%alloc_5) : 
(memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - %5 = call @Unknown7(%arg10) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %5 = call @Unknown4(%arg10) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> %alloc_8 = memref.alloc() : memref<1x64x56x56xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%4, %5, %alloc_8) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> %alloc_9 = memref.alloc() : memref<1x64x56x56xf16, "cuda"> @@ -2611,22 +801,22 @@ module attributes {gpu.container_module} { %alloc_11 = memref.alloc() : memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_8, %arg8, %arg7, %alloc_9, %alloc_10, %alloc_11) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> %6 = call @Unknown9(%alloc_9, %alloc_3) : (memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - %7 = call @Unknown10(%arg15) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %7 = call @Unknown4(%arg15) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> %alloc_12 = memref.alloc() : memref<1x64x56x56xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%6, %7, %alloc_12) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> %alloc_13 = memref.alloc() : memref<1x64x56x56xf16, "cuda"> %alloc_14 = memref.alloc() : memref<64xf32, "cuda"> %alloc_15 = memref.alloc() : memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_12, %arg12, %arg11, %alloc_13, %alloc_14, %alloc_15) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %8 = call @Unknown12(%alloc_13) : (memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - %9 = call @Unknown13(%arg16) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %8 = call @Unknown6(%alloc_13) : (memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %9 = call @Unknown4(%arg16) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> %alloc_16 = memref.alloc() : memref<1x64x56x56xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%8, %9, %alloc_16) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation 
= dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> %alloc_17 = memref.alloc() : memref<1x64x56x56xf16, "cuda"> %alloc_18 = memref.alloc() : memref<64xf32, "cuda"> %alloc_19 = memref.alloc() : memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_16, %arg14, %arg13, %alloc_17, %alloc_18, %alloc_19) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %10 = call @Unknown15(%alloc_17, %6) : (memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %10 = call @Unknown9(%alloc_17, %6) : (memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> %11 = call @Unknown16(%arg23) : (memref<128x64x1x1xf32, "cuda">) -> memref<128x64x1x1xf16, "cuda"> %alloc_20 = memref.alloc() : memref<1x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%10, %11, %alloc_20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> @@ -2650,22 +840,22 @@ module attributes {gpu.container_module} { %alloc_31 = memref.alloc() : memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_28, %arg20, %arg19, %alloc_29, %alloc_30, %alloc_31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> %15 = call @Unknown23(%alloc_29, %alloc_21) : (memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - %16 = call @Unknown24(%arg30) : (memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %16 = call @Unknown21(%arg30) : (memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> %alloc_32 = memref.alloc() : memref<1x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%15, %16, %alloc_32) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> %alloc_33 = memref.alloc() : memref<1x128x28x28xf16, "cuda"> %alloc_34 = memref.alloc() : memref<128xf32, "cuda"> %alloc_35 = memref.alloc() : memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_32, 
%arg27, %arg26, %alloc_33, %alloc_34, %alloc_35) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %17 = call @Unknown26(%alloc_33) : (memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - %18 = call @Unknown27(%arg31) : (memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %17 = call @Unknown20(%alloc_33) : (memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %18 = call @Unknown21(%arg31) : (memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> %alloc_36 = memref.alloc() : memref<1x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%17, %18, %alloc_36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> %alloc_37 = memref.alloc() : memref<1x128x28x28xf16, "cuda"> %alloc_38 = memref.alloc() : memref<128xf32, "cuda"> %alloc_39 = memref.alloc() : memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_36, %arg29, %arg28, %alloc_37, %alloc_38, %alloc_39) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %19 = call @Unknown29(%alloc_37, %15) : (memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %19 = call @Unknown23(%alloc_37, %15) : (memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> %20 = call @Unknown30(%arg38) : (memref<256x128x1x1xf32, "cuda">) -> memref<256x128x1x1xf16, "cuda"> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%19, %20, %alloc_40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> @@ -2689,22 +879,22 @@ module attributes {gpu.container_module} { %alloc_51 = memref.alloc() : memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_48, %arg35, %arg34, %alloc_49, %alloc_50, %alloc_51) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> %24 = call @Unknown37(%alloc_49, %alloc_41) : (memref<1x256x14x14xf16, "cuda">, 
memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - %25 = call @Unknown38(%arg45) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + %25 = call @Unknown35(%arg45) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> %alloc_52 = memref.alloc() : memref<1x256x14x14xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%24, %25, %alloc_52) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> %alloc_53 = memref.alloc() : memref<1x256x14x14xf16, "cuda"> %alloc_54 = memref.alloc() : memref<256xf32, "cuda"> %alloc_55 = memref.alloc() : memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_52, %arg42, %arg41, %alloc_53, %alloc_54, %alloc_55) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %26 = call @Unknown40(%alloc_53) : (memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - %27 = call @Unknown41(%arg46) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + %26 = call @Unknown34(%alloc_53) : (memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %27 = call @Unknown35(%arg46) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> %alloc_56 = memref.alloc() : memref<1x256x14x14xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%26, %27, %alloc_56) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> %alloc_57 = memref.alloc() : memref<1x256x14x14xf16, "cuda"> %alloc_58 = memref.alloc() : memref<256xf32, "cuda"> %alloc_59 = memref.alloc() : memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_56, %arg44, %arg43, %alloc_57, %alloc_58, %alloc_59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %28 = call @Unknown43(%alloc_57, %24) : (memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %28 = call @Unknown37(%alloc_57, %24) : (memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> %29 = call @Unknown44(%arg53) : (memref<512x256x1x1xf32, "cuda">) -> memref<512x256x1x1xf16, "cuda"> %alloc_60 = memref.alloc() : memref<1x512x7x7xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%28, %29, %alloc_60) {batch_group_count = 1 : 
i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> @@ -2728,71 +918,73 @@ module attributes {gpu.container_module} { %alloc_71 = memref.alloc() : memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_68, %arg50, %arg49, %alloc_69, %alloc_70, %alloc_71) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> %33 = call @Unknown51(%alloc_69, %alloc_61) : (memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %34 = call @Unknown52(%arg60) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %34 = call @Unknown49(%arg60) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> %alloc_72 = memref.alloc() : memref<1x512x7x7xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%33, %34, %alloc_72) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> %alloc_73 = memref.alloc() : memref<1x512x7x7xf16, "cuda"> %alloc_74 = memref.alloc() : memref<512xf32, "cuda"> %alloc_75 = memref.alloc() : memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_72, %arg57, %arg56, %alloc_73, %alloc_74, %alloc_75) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %35 = call @Unknown54(%alloc_73) : (memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %36 = call @Unknown55(%arg61) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %35 = call @Unknown48(%alloc_73) : (memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %36 = call @Unknown49(%arg61) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> %alloc_76 = memref.alloc() : memref<1x512x7x7xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%35, %36, %alloc_76) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> %alloc_77 = memref.alloc() : memref<1x512x7x7xf16, "cuda"> %alloc_78 = memref.alloc() : 
memref<512xf32, "cuda"> %alloc_79 = memref.alloc() : memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_76, %arg59, %arg58, %alloc_77, %alloc_78, %alloc_79) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %37 = call @Unknown57(%alloc_77, %33) : (memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %alloc_80 = memref.alloc() : memref<1x512xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%37, %alloc_80) {device = "cuda", dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512xf16, "cuda"> - %38 = call @Unknown58(%alloc_80) : (memref<1x512xf16, "cuda">) -> memref<1x512xf16, "cuda"> - %39 = call @Unknown59(%arg4) : (memref<1000x512xf32, "cuda">) -> memref<1000x512xf16, "cuda"> + %37 = call @Unknown51(%alloc_77, %33) : (memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %collapse_shape = memref.collapse_shape %37 [[0, 1], [2, 3]] : memref<1x512x7x7xf16, "cuda"> into memref<512x49xf16, "cuda"> + %alloc_80 = memref.alloc() : memref<512xf16, "cuda"> + byre.compute @PTXOp(%collapse_shape, %alloc_80) {BlockSize.x = 64 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 512 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown58_kernel"} : memref<512x49xf16, "cuda">, memref<512xf16, "cuda"> + %expand_shape = memref.expand_shape %alloc_80 [[0, 1]] : memref<512xf16, "cuda"> into memref<1x512xf16, "cuda"> + %38 = call @Unknown59(%expand_shape) : (memref<1x512xf16, "cuda">) -> memref<1x512xf16, "cuda"> + %39 = call @Unknown60(%arg4) : (memref<1000x512xf32, "cuda">) -> memref<1000x512xf16, "cuda"> %alloc_81 = memref.alloc() : memref<512x1000xf16, "cuda"> byre.compute @TransposeOp_f16_f16(%39, %alloc_81) {device = "cuda", memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16, "cuda">, memref<512x1000xf16, "cuda"> %alloc_82 = memref.alloc() : memref<1x1000xf16, "cuda"> byre.compute @MatmulOp_f16f16_f16(%38, %39, %alloc_82) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<1x1000xf16, "cuda"> - %40 = call @Unknown60(%arg3, %alloc_82) : (memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">) -> memref<1x1000xf16, "cuda"> - %41 = call @Unknown61(%alloc_1, %arg63) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %40 = call @Unknown61(%arg3, %alloc_82) : (memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">) -> memref<1x1000xf16, "cuda"> + %41 = call @Unknown62(%alloc_1, %arg63) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> %42 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %43 = call @Unknown63(%alloc_6, %arg66) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %44 = call @Unknown64(%alloc_7, %arg67) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %45 = call 
@Unknown65(%alloc_10, %arg69) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %46 = call @Unknown66(%alloc_11, %arg70) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %47 = call @Unknown67(%alloc_14, %arg72) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %48 = call @Unknown68(%alloc_15, %arg73) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %49 = call @Unknown69(%alloc_18, %arg75) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %50 = call @Unknown70(%alloc_19, %arg76) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %51 = call @Unknown71(%alloc_26, %arg78) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %43 = call @Unknown62(%alloc_6, %arg66) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %44 = call @Unknown62(%alloc_7, %arg67) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %45 = call @Unknown62(%alloc_10, %arg69) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %46 = call @Unknown62(%alloc_11, %arg70) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %47 = call @Unknown62(%alloc_14, %arg72) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %48 = call @Unknown62(%alloc_15, %arg73) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %49 = call @Unknown62(%alloc_18, %arg75) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %50 = call @Unknown62(%alloc_19, %arg76) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %51 = call @Unknown72(%alloc_26, %arg78) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> %52 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %53 = call @Unknown73(%alloc_30, %arg81) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %54 = call @Unknown74(%alloc_31, %arg82) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %55 = call @Unknown75(%alloc_22, %arg84) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %56 = call @Unknown76(%alloc_23, %arg85) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %57 = call @Unknown77(%alloc_34, %arg87) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %58 = call @Unknown78(%alloc_35, %arg88) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %59 = call @Unknown79(%alloc_38, %arg90) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %60 = call @Unknown80(%alloc_39, %arg91) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %61 = call @Unknown81(%alloc_46, %arg93) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %53 = call @Unknown72(%alloc_30, %arg81) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %54 = call @Unknown72(%alloc_31, %arg82) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %55 = call @Unknown72(%alloc_22, %arg84) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %56 = call @Unknown72(%alloc_23, %arg85) : 
(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %57 = call @Unknown72(%alloc_34, %arg87) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %58 = call @Unknown72(%alloc_35, %arg88) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %59 = call @Unknown72(%alloc_38, %arg90) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %60 = call @Unknown72(%alloc_39, %arg91) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %61 = call @Unknown82(%alloc_46, %arg93) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> %62 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %63 = call @Unknown83(%alloc_50, %arg96) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %64 = call @Unknown84(%alloc_51, %arg97) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %65 = call @Unknown85(%alloc_42, %arg99) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %66 = call @Unknown86(%alloc_43, %arg100) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %67 = call @Unknown87(%alloc_54, %arg102) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %68 = call @Unknown88(%alloc_55, %arg103) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %69 = call @Unknown89(%alloc_58, %arg105) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %70 = call @Unknown90(%alloc_59, %arg106) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %71 = call @Unknown91(%alloc_66, %arg108) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %63 = call @Unknown82(%alloc_50, %arg96) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %64 = call @Unknown82(%alloc_51, %arg97) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %65 = call @Unknown82(%alloc_42, %arg99) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %66 = call @Unknown82(%alloc_43, %arg100) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %67 = call @Unknown82(%alloc_54, %arg102) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %68 = call @Unknown82(%alloc_55, %arg103) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %69 = call @Unknown82(%alloc_58, %arg105) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %70 = call @Unknown82(%alloc_59, %arg106) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %71 = call @Unknown92(%alloc_66, %arg108) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> %72 = call @Unknown92(%alloc_67, %arg109) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> - %73 = call @Unknown93(%alloc_70, %arg111) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> - %74 = call @Unknown94(%alloc_71, %arg112) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> - %75 = call @Unknown95(%alloc_62, %arg114) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> - %76 = call 
@Unknown96(%alloc_63, %arg115) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> - %77 = call @Unknown97(%alloc_74, %arg117) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> - %78 = call @Unknown98(%alloc_75, %arg118) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> - %79 = call @Unknown99(%alloc_78, %arg120) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> - %80 = call @Unknown100(%alloc_79, %arg121) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %73 = call @Unknown92(%alloc_70, %arg111) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %74 = call @Unknown92(%alloc_71, %arg112) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %75 = call @Unknown92(%alloc_62, %arg114) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %76 = call @Unknown92(%alloc_63, %arg115) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %77 = call @Unknown92(%alloc_74, %arg117) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %78 = call @Unknown92(%alloc_75, %arg118) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %79 = call @Unknown92(%alloc_78, %arg120) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %80 = call @Unknown92(%alloc_79, %arg121) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> return %40, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %38, %alloc_81 : memref<1x1000xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, 
memref<512xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<64x3x7x7xf16, "cuda">, memref<1x3x224x224xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512xf16, "cuda">, memref<512x1000xf16, "cuda"> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/FW/9a_byre_host.mlir b/compiler/test/E2E/ResNet18/FW/9a_byre_host.mlir index 0e7edc2f2..9928b1165 100644 --- a/compiler/test/E2E/ResNet18/FW/9a_byre_host.mlir +++ b/compiler/test/E2E/ResNet18/FW/9a_byre_host.mlir @@ -4,2682 +4,927 @@ module attributes {byre.container_module, gpu.container_module} { gpu.module @unified { - gpu.func @Unknown100(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 
1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown99(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown98(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown97(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown96(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown95(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - 
%4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown94(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown93(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown91(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown90(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load 
%arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown89(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown88(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown87(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown86(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown85(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : 
memref<256xf32> - } - gpu.return - } - gpu.func @Unknown84(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown83(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown81(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown80(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown79(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = 
arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown78(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown77(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown76(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown75(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown74(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x 
- %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown73(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown71(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown70(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown69(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : 
memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown68(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown67(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown66(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown65(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown64(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : 
memref<64xf32> - } - gpu.return - } - gpu.func @Unknown63(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown61(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index + gpu.func @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c512 step %6 { + %7 = memref.load %arg1[%arg3] : memref<512xf32> + %8 = memref.load %arg0[%arg3] : memref<512xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<512xf32> } gpu.return } - gpu.func @Unknown60(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>, %arg2: memref<1x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c1000 = arith.constant 1000 : index + gpu.func @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg1[%c0, %4] : memref<1x1000xf16> - %7 = memref.load %arg0[%4] : memref<1000xf32> - %8 = arith.truncf %7 : f32 to f16 - %9 = arith.addf %6, %8 : f16 - memref.store %9, %arg2[%c0, %4] : memref<1x1000xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c256 step %6 { + %7 = memref.load %arg1[%arg3] : 
memref<256xf32> + %8 = memref.load %arg0[%arg3] : memref<256xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<256xf32> } gpu.return } - gpu.func @Unknown59(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { - %c0 = arith.constant 0 : index - %c512000 = arith.constant 512000 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c128 step %6 { + %7 = memref.load %arg1[%arg3] : memref<128xf32> + %8 = memref.load %arg0[%arg3] : memref<128xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<128xf32> } gpu.return } - gpu.func @Unknown58(%arg0: memref<1x512xf16>, %arg1: memref<1x512xf16>) kernel { - %cst = arith.constant 2.040100e-02 : f16 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index + gpu.func @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%c0, %4] : memref<1x512xf16> - %7 = arith.mulf %6, %cst : f16 - memref.store %7, %arg1[%c0, %4] : memref<1x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c64 step %6 { + %7 = memref.load %arg1[%arg3] : memref<64xf32> + %8 = memref.load %arg0[%arg3] : memref<64xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<64xf32> } gpu.return } - gpu.func @Unknown57(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 + gpu.func @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>, %arg2: memref<1x1000xf16>) kernel { %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - 
%7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg3] : memref<1000xf32> + %8 = memref.load %arg1[%c0, %arg3] : memref<1x1000xf16> + %9 = arith.truncf %7 : f32 to f16 + %10 = arith.addf %8, %9 : f16 + memref.store %10, %arg2[%c0, %arg3] : memref<1x1000xf16> } gpu.return } - gpu.func @Unknown55(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown60(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { + %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown54(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = 
arith.constant 7 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf16> } gpu.return } - gpu.func @Unknown52(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { + gpu.func @Unknown59(%arg0: memref<1x512xf16>, %arg1: memref<1x512xf16>) kernel { + %cst = arith.constant 2.040100e-02 : f16 %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for 
%arg2 = %4 to %c512 step %6 { + %7 = memref.load %arg0[%c0, %arg2] : memref<1x512xf16> + %8 = arith.mulf %7, %cst : f16 + memref.store %8, %arg1[%c0, %arg2] : memref<1x512xf16> } gpu.return } gpu.func @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown49(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, 
%24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf16> } gpu.return } gpu.func @Unknown48(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown46(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index 
- %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf16> } gpu.return } gpu.func @Unknown44(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - } - gpu.return - } - gpu.func @Unknown43(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - 
%12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown41(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown40(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, 
%13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown38(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> } gpu.return } gpu.func @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = 
arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown35(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = 
memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf16> } gpu.return } gpu.func @Unknown34(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg2, %c14 : index + %8 = arith.divsi %arg2, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown32(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c294912 = arith.constant 294912 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - 
%31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf16> } gpu.return } gpu.func @Unknown30(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - } - gpu.return - } - gpu.func @Unknown29(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown27(%arg0: 
memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown26(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown24(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index %0 = 
gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : 
index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown21(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf16> } gpu.return } gpu.func @Unknown20(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = 
arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg2, %c28 : index + %8 = arith.divsi %arg2, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown18(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf16> } gpu.return } 
gpu.func @Unknown16(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - } - gpu.return - } - gpu.func @Unknown15(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown13(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - 
%19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown10(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - 
%26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> } gpu.return } gpu.func @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown7(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = 
arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown6(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg2, %c56 : index + %8 = arith.divsi %arg2, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = 
gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf16> } gpu.return } gpu.func @Unknown3(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, 
%5 : index + scf.for %arg2 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg2, %c112 : index + %8 = arith.divsi %arg2, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x64x112x112xf16> } gpu.return } gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { - %c0 = arith.constant 0 : index %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf16> } gpu.return } gpu.func @Unknown0(%arg0: memref<1x3x224x224xf32>, %arg1: memref<1x3x224x224xf16>) kernel { - %c0 = arith.constant 0 : index %c150528 = arith.constant 150528 : index + %c0 = arith.constant 0 : index %c224 = arith.constant 224 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c150528 : index - scf.if %5 { - %6 = arith.remsi %4, %c224 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c224 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c224 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 
: index - %16 = arith.remsi %15, %c224 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c224 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c224 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x3x224x224xf32> - %27 = arith.truncf %26 : f32 to f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x3x224x224xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c150528 step %6 { + %7 = arith.remsi %arg2, %c224 : index + %8 = arith.divsi %arg2, %c224 : index + %9 = arith.remsi %8, %c224 : index + %10 = arith.divsi %8, %c224 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x3x224x224xf32> + %12 = arith.truncf %11 : f32 to f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x3x224x224xf16> } gpu.return } + gpu.func @Unknown58_kernel(%arg0: memref<512x49xf16>, %arg1: memref<512xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c49 = arith.constant 49 : index + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 49] [1, 1] : memref<512x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c64 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c64 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c49 : index + %7 = arith.select %6, %5, %c49 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c49 : index + %10 = arith.select %9, %8, %c49 : index + %11 = arith.subi %10, %7 : index + %subview_0 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13 = scf.if %12 -> (f16) { + %21 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %14 = arith.addf %13, %cst : f16 + memref.store %14, %alloca[%1] : memref<64xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<32xf16, #gpu.address_space> + %15 = arith.cmpi ult, %1, %c32 : index + scf.if %15 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca[%21] : memref<64xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca[%24] : memref<64xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_2[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<16xf16, #gpu.address_space> + %16 = arith.cmpi ult, %1, %c16 : index + scf.if %16 { + %21 = arith.muli %1, %c2 : index + %22 = 
memref.load %alloca_2[%21] : memref<32xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_2[%24] : memref<32xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_3[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<8xf16, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c8 : index + scf.if %17 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_3[%21] : memref<16xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_3[%24] : memref<16xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_4[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<4xf16, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c4 : index + scf.if %18 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_4[%21] : memref<8xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_4[%24] : memref<8xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_5[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<2xf16, #gpu.address_space> + %19 = arith.cmpi ult, %1, %c2 : index + scf.if %19 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_5[%21] : memref<4xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_5[%24] : memref<4xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_6[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %20 = arith.cmpi ult, %1, %c1 : index + scf.if %20 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_6[%21] : memref<2xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_6[%24] : memref<2xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %arg1[%0] : memref<512xf16> + } + gpu.barrier + gpu.return + } } func.func @main(%arg0: memref<64xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<64xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<1000xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<1000x512xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<64xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<64xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<64xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<64xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<64x64x3x3xf32, "cuda"> {byre.argname = 
"Input15", byre.argtype = 1 : i32}, %arg16: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<128xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<128xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<128xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<128xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<128xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<128xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<128xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<128xf32, "cuda"> {byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<128xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<128xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, %arg31: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<256xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<256xf32, "cuda"> {byre.argname = "Input33", byre.argtype = 1 : i32}, %arg34: memref<256xf32, "cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<256xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input37", byre.argtype = 1 : i32}, %arg38: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<256xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<256xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<256xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<256xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<256xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<256xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<512xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<512xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<512xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<512xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<512xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<512xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<512xf32, "cuda"> {byre.argname = 
"Input56", byre.argtype = 1 : i32}, %arg57: memref<512xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<512xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<512xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<64xf32, "cuda"> {byre.argname = "Input63", byre.argtype = 1 : i32}, %arg64: memref<64xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<64xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<64xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<64xf32, "cuda"> {byre.argname = "Input69", byre.argtype = 1 : i32}, %arg70: memref<64xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<64xf32, "cuda"> {byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<64xf32, "cuda"> {byre.argname = "Input73", byre.argtype = 1 : i32}, %arg74: memref {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<64xf32, "cuda"> {byre.argname = "Input75", byre.argtype = 1 : i32}, %arg76: memref<64xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<128xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: memref<128xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<128xf32, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<128xf32, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<128xf32, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<128xf32, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<128xf32, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<128xf32, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<128xf32, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<128xf32, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<256xf32, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<256xf32, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref {byre.argname = "Input95", byre.argtype = 1 : i32}, %arg96: memref<256xf32, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<256xf32, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<256xf32, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<256xf32, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref {byre.argname = 
"Input101", byre.argtype = 1 : i32}, %arg102: memref<256xf32, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<256xf32, "cuda"> {byre.argname = "Input103", byre.argtype = 1 : i32}, %arg104: memref {byre.argname = "Input104", byre.argtype = 1 : i32}, %arg105: memref<256xf32, "cuda"> {byre.argname = "Input105", byre.argtype = 1 : i32}, %arg106: memref<256xf32, "cuda"> {byre.argname = "Input106", byre.argtype = 1 : i32}, %arg107: memref {byre.argname = "Input107", byre.argtype = 1 : i32}, %arg108: memref<512xf32, "cuda"> {byre.argname = "Input108", byre.argtype = 1 : i32}, %arg109: memref<512xf32, "cuda"> {byre.argname = "Input109", byre.argtype = 1 : i32}, %arg110: memref {byre.argname = "Input110", byre.argtype = 1 : i32}, %arg111: memref<512xf32, "cuda"> {byre.argname = "Input111", byre.argtype = 1 : i32}, %arg112: memref<512xf32, "cuda"> {byre.argname = "Input112", byre.argtype = 1 : i32}, %arg113: memref {byre.argname = "Input113", byre.argtype = 1 : i32}, %arg114: memref<512xf32, "cuda"> {byre.argname = "Input114", byre.argtype = 1 : i32}, %arg115: memref<512xf32, "cuda"> {byre.argname = "Input115", byre.argtype = 1 : i32}, %arg116: memref {byre.argname = "Input116", byre.argtype = 1 : i32}, %arg117: memref<512xf32, "cuda"> {byre.argname = "Input117", byre.argtype = 1 : i32}, %arg118: memref<512xf32, "cuda"> {byre.argname = "Input118", byre.argtype = 1 : i32}, %arg119: memref {byre.argname = "Input119", byre.argtype = 1 : i32}, %arg120: memref<512xf32, "cuda"> {byre.argname = "Input120", byre.argtype = 1 : i32}, %arg121: memref<512xf32, "cuda"> {byre.argname = "Input121", byre.argtype = 1 : i32}, %arg122: memref<1x3x224x224xf32, "cuda"> {byre.argname = "Input122", byre.argtype = 1 : i32}, %arg123: memref<1x1000xf16, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg124: memref<64xf32, "cuda"> {byre.arg_alias_index = 0 : i64, byre.argname = "Output1", byre.argtype = 2 : i32}, %arg125: memref<64xf32, "cuda"> {byre.arg_alias_index = 1 : i64, byre.argname = "Output2", byre.argtype = 2 : i32}, %arg126: memref<64xf32, "cuda"> {byre.arg_alias_index = 5 : i64, byre.argname = "Output3", byre.argtype = 2 : i32}, %arg127: memref<64xf32, "cuda"> {byre.arg_alias_index = 6 : i64, byre.argname = "Output4", byre.argtype = 2 : i32}, %arg128: memref<64xf32, "cuda"> {byre.arg_alias_index = 7 : i64, byre.argname = "Output5", byre.argtype = 2 : i32}, %arg129: memref<64xf32, "cuda"> {byre.arg_alias_index = 8 : i64, byre.argname = "Output6", byre.argtype = 2 : i32}, %arg130: memref<64xf32, "cuda"> {byre.arg_alias_index = 11 : i64, byre.argname = "Output7", byre.argtype = 2 : i32}, %arg131: memref<64xf32, "cuda"> {byre.arg_alias_index = 12 : i64, byre.argname = "Output8", byre.argtype = 2 : i32}, %arg132: memref<64xf32, "cuda"> {byre.arg_alias_index = 13 : i64, byre.argname = "Output9", byre.argtype = 2 : i32}, %arg133: memref<64xf32, "cuda"> {byre.arg_alias_index = 14 : i64, byre.argname = "Output10", byre.argtype = 2 : i32}, %arg134: memref<128xf32, "cuda"> {byre.arg_alias_index = 17 : i64, byre.argname = "Output11", byre.argtype = 2 : i32}, %arg135: memref<128xf32, "cuda"> {byre.arg_alias_index = 18 : i64, byre.argname = "Output12", byre.argtype = 2 : i32}, %arg136: memref<128xf32, "cuda"> {byre.arg_alias_index = 19 : i64, byre.argname = "Output13", byre.argtype = 2 : i32}, %arg137: memref<128xf32, "cuda"> {byre.arg_alias_index = 20 : i64, byre.argname = "Output14", byre.argtype = 2 : i32}, %arg138: memref<128xf32, "cuda"> {byre.arg_alias_index = 24 : i64, 
byre.argname = "Output15", byre.argtype = 2 : i32}, %arg139: memref<128xf32, "cuda"> {byre.arg_alias_index = 25 : i64, byre.argname = "Output16", byre.argtype = 2 : i32}, %arg140: memref<128xf32, "cuda"> {byre.arg_alias_index = 26 : i64, byre.argname = "Output17", byre.argtype = 2 : i32}, %arg141: memref<128xf32, "cuda"> {byre.arg_alias_index = 27 : i64, byre.argname = "Output18", byre.argtype = 2 : i32}, %arg142: memref<128xf32, "cuda"> {byre.arg_alias_index = 28 : i64, byre.argname = "Output19", byre.argtype = 2 : i32}, %arg143: memref<128xf32, "cuda"> {byre.arg_alias_index = 29 : i64, byre.argname = "Output20", byre.argtype = 2 : i32}, %arg144: memref<256xf32, "cuda"> {byre.arg_alias_index = 32 : i64, byre.argname = "Output21", byre.argtype = 2 : i32}, %arg145: memref<256xf32, "cuda"> {byre.arg_alias_index = 33 : i64, byre.argname = "Output22", byre.argtype = 2 : i32}, %arg146: memref<256xf32, "cuda"> {byre.arg_alias_index = 34 : i64, byre.argname = "Output23", byre.argtype = 2 : i32}, %arg147: memref<256xf32, "cuda"> {byre.arg_alias_index = 35 : i64, byre.argname = "Output24", byre.argtype = 2 : i32}, %arg148: memref<256xf32, "cuda"> {byre.arg_alias_index = 39 : i64, byre.argname = "Output25", byre.argtype = 2 : i32}, %arg149: memref<256xf32, "cuda"> {byre.arg_alias_index = 40 : i64, byre.argname = "Output26", byre.argtype = 2 : i32}, %arg150: memref<256xf32, "cuda"> {byre.arg_alias_index = 41 : i64, byre.argname = "Output27", byre.argtype = 2 : i32}, %arg151: memref<256xf32, "cuda"> {byre.arg_alias_index = 42 : i64, byre.argname = "Output28", byre.argtype = 2 : i32}, %arg152: memref<256xf32, "cuda"> {byre.arg_alias_index = 43 : i64, byre.argname = "Output29", byre.argtype = 2 : i32}, %arg153: memref<256xf32, "cuda"> {byre.arg_alias_index = 44 : i64, byre.argname = "Output30", byre.argtype = 2 : i32}, %arg154: memref<512xf32, "cuda"> {byre.arg_alias_index = 47 : i64, byre.argname = "Output31", byre.argtype = 2 : i32}, %arg155: memref<512xf32, "cuda"> {byre.arg_alias_index = 48 : i64, byre.argname = "Output32", byre.argtype = 2 : i32}, %arg156: memref<512xf32, "cuda"> {byre.arg_alias_index = 49 : i64, byre.argname = "Output33", byre.argtype = 2 : i32}, %arg157: memref<512xf32, "cuda"> {byre.arg_alias_index = 50 : i64, byre.argname = "Output34", byre.argtype = 2 : i32}, %arg158: memref<512xf32, "cuda"> {byre.arg_alias_index = 54 : i64, byre.argname = "Output35", byre.argtype = 2 : i32}, %arg159: memref<512xf32, "cuda"> {byre.arg_alias_index = 55 : i64, byre.argname = "Output36", byre.argtype = 2 : i32}, %arg160: memref<512xf32, "cuda"> {byre.arg_alias_index = 56 : i64, byre.argname = "Output37", byre.argtype = 2 : i32}, %arg161: memref<512xf32, "cuda"> {byre.arg_alias_index = 57 : i64, byre.argname = "Output38", byre.argtype = 2 : i32}, %arg162: memref<512xf32, "cuda"> {byre.arg_alias_index = 58 : i64, byre.argname = "Output39", byre.argtype = 2 : i32}, %arg163: memref<512xf32, "cuda"> {byre.arg_alias_index = 59 : i64, byre.argname = "Output40", byre.argtype = 2 : i32}, %arg164: memref<64xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : i32}, %arg165: memref<64xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg166: memref<64xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg167: memref<64xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg168: memref<64xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, %arg169: memref<64xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg170: 
memref<64xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg171: memref<64xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg172: memref<64xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : i32}, %arg173: memref<64xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg174: memref<128xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg175: memref<128xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg176: memref<128xf32, "cuda"> {byre.argname = "Output53", byre.argtype = 2 : i32}, %arg177: memref<128xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg178: memref<128xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg179: memref<128xf32, "cuda"> {byre.argname = "Output56", byre.argtype = 2 : i32}, %arg180: memref<128xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg181: memref<128xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg182: memref<128xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg183: memref<128xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg184: memref<256xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}, %arg185: memref<256xf32, "cuda"> {byre.argname = "Output62", byre.argtype = 2 : i32}, %arg186: memref<256xf32, "cuda"> {byre.argname = "Output63", byre.argtype = 2 : i32}, %arg187: memref<256xf32, "cuda"> {byre.argname = "Output64", byre.argtype = 2 : i32}, %arg188: memref<256xf32, "cuda"> {byre.argname = "Output65", byre.argtype = 2 : i32}, %arg189: memref<256xf32, "cuda"> {byre.argname = "Output66", byre.argtype = 2 : i32}, %arg190: memref<256xf32, "cuda"> {byre.argname = "Output67", byre.argtype = 2 : i32}, %arg191: memref<256xf32, "cuda"> {byre.argname = "Output68", byre.argtype = 2 : i32}, %arg192: memref<256xf32, "cuda"> {byre.argname = "Output69", byre.argtype = 2 : i32}, %arg193: memref<256xf32, "cuda"> {byre.argname = "Output70", byre.argtype = 2 : i32}, %arg194: memref<512xf32, "cuda"> {byre.argname = "Output71", byre.argtype = 2 : i32}, %arg195: memref<512xf32, "cuda"> {byre.argname = "Output72", byre.argtype = 2 : i32}, %arg196: memref<512xf32, "cuda"> {byre.argname = "Output73", byre.argtype = 2 : i32}, %arg197: memref<512xf32, "cuda"> {byre.argname = "Output74", byre.argtype = 2 : i32}, %arg198: memref<512xf32, "cuda"> {byre.argname = "Output75", byre.argtype = 2 : i32}, %arg199: memref<512xf32, "cuda"> {byre.argname = "Output76", byre.argtype = 2 : i32}, %arg200: memref<512xf32, "cuda"> {byre.argname = "Output77", byre.argtype = 2 : i32}, %arg201: memref<512xf32, "cuda"> {byre.argname = "Output78", byre.argtype = 2 : i32}, %arg202: memref<512xf32, "cuda"> {byre.argname = "Output79", byre.argtype = 2 : i32}, %arg203: memref<512xf32, "cuda"> {byre.argname = "Output80", byre.argtype = 2 : i32}, %arg204: memref<64x3x7x7xf16, "cuda"> {byre.argname = "Output81", byre.argtype = 2 : i32}, %arg205: memref<1x3x224x224xf16, "cuda"> {byre.argname = "Output82", byre.argtype = 2 : i32}, %arg206: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Output83", byre.argtype = 2 : i32}, %arg207: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Output84", byre.argtype = 2 : i32}, %arg208: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output85", byre.argtype = 2 : i32}, %arg209: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output86", byre.argtype = 2 : i32}, %arg210: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output87", byre.argtype = 
2 : i32}, %arg211: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output88", byre.argtype = 2 : i32}, %arg212: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output89", byre.argtype = 2 : i32}, %arg213: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output90", byre.argtype = 2 : i32}, %arg214: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output91", byre.argtype = 2 : i32}, %arg215: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output92", byre.argtype = 2 : i32}, %arg216: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output93", byre.argtype = 2 : i32}, %arg217: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output94", byre.argtype = 2 : i32}, %arg218: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output95", byre.argtype = 2 : i32}, %arg219: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output96", byre.argtype = 2 : i32}, %arg220: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output97", byre.argtype = 2 : i32}, %arg221: memref<128x64x3x3xf16, "cuda"> {byre.argname = "Output98", byre.argtype = 2 : i32}, %arg222: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output99", byre.argtype = 2 : i32}, %arg223: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output100", byre.argtype = 2 : i32}, %arg224: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output101", byre.argtype = 2 : i32}, %arg225: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output102", byre.argtype = 2 : i32}, %arg226: memref<128x64x1x1xf16, "cuda"> {byre.argname = "Output103", byre.argtype = 2 : i32}, %arg227: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output104", byre.argtype = 2 : i32}, %arg228: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output105", byre.argtype = 2 : i32}, %arg229: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output106", byre.argtype = 2 : i32}, %arg230: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output107", byre.argtype = 2 : i32}, %arg231: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output108", byre.argtype = 2 : i32}, %arg232: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output109", byre.argtype = 2 : i32}, %arg233: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output110", byre.argtype = 2 : i32}, %arg234: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output111", byre.argtype = 2 : i32}, %arg235: memref<256x128x3x3xf16, "cuda"> {byre.argname = "Output112", byre.argtype = 2 : i32}, %arg236: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output113", byre.argtype = 2 : i32}, %arg237: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output114", byre.argtype = 2 : i32}, %arg238: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output115", byre.argtype = 2 : i32}, %arg239: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output116", byre.argtype = 2 : i32}, %arg240: memref<256x128x1x1xf16, "cuda"> {byre.argname = "Output117", byre.argtype = 2 : i32}, %arg241: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output118", byre.argtype = 2 : i32}, %arg242: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output119", byre.argtype = 2 : i32}, %arg243: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output120", byre.argtype = 2 : i32}, %arg244: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output121", byre.argtype = 2 : i32}, %arg245: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output122", byre.argtype = 2 : i32}, %arg246: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output123", byre.argtype = 2 : i32}, %arg247: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output124", byre.argtype = 2 : i32}, %arg248: memref<1x256x14x14xf16, "cuda"> {byre.argname = 
"Output125", byre.argtype = 2 : i32}, %arg249: memref<512x256x3x3xf16, "cuda"> {byre.argname = "Output126", byre.argtype = 2 : i32}, %arg250: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output127", byre.argtype = 2 : i32}, %arg251: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output128", byre.argtype = 2 : i32}, %arg252: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output129", byre.argtype = 2 : i32}, %arg253: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output130", byre.argtype = 2 : i32}, %arg254: memref<512x256x1x1xf16, "cuda"> {byre.argname = "Output131", byre.argtype = 2 : i32}, %arg255: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output132", byre.argtype = 2 : i32}, %arg256: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output133", byre.argtype = 2 : i32}, %arg257: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output134", byre.argtype = 2 : i32}, %arg258: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output135", byre.argtype = 2 : i32}, %arg259: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output136", byre.argtype = 2 : i32}, %arg260: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output137", byre.argtype = 2 : i32}, %arg261: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output138", byre.argtype = 2 : i32}, %arg262: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output139", byre.argtype = 2 : i32}, %arg263: memref<1x512xf16, "cuda"> {byre.argname = "Output140", byre.argtype = 2 : i32}, %arg264: memref<512x1000xf16, "cuda"> {byre.argname = "Output141", byre.argtype = 2 : i32}) attributes {byre.entry_point} { %alloc = memref.alloc() : memref<1838592xi8, "cuda"> - byre.compute @PTXOp(%arg122, %arg205) {BlockSize.x = 128 : i32, GridSize.x = 1176 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<1x3x224x224xf32, "cuda">, memref<1x3x224x224xf16, "cuda"> - byre.compute @PTXOp(%arg2, %arg204) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg122, %arg205) {BlockSize.x = 256 : i32, GridSize.x = 147 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<1x3x224x224xf32, "cuda">, memref<1x3x224x224xf16, "cuda"> + byre.compute @PTXOp(%arg2, %arg204) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg205, %arg204, %arg206) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16, "cuda">, memref<64x3x7x7xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> - %0 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> - %1 = "byre.alias"(%alloc) {offset = 7424 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %2 = "byre.alias"(%alloc) {offset = 7168 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> 
memref<1x64x112x112xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 7424 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 7168 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg206, %arg1, %arg0, %0, %1, %2) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%0, %arg207) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%0, %arg207) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> byre.compute @PoolMaxOp_f16_f16(%arg207, %arg208) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<1x64x112x112xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg9, %arg209) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg9, %arg209) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg208, %arg209, %arg210) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %3 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - %4 = "byre.alias"(%alloc) {offset = 6912 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %5 = "byre.alias"(%alloc) {offset = 6656 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %3 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 6912 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 6656 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg210, %arg6, %arg5, %3, %4, %5) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, 
"cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg211) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg10, %arg212) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg211) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg10, %arg212) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg211, %arg212, %arg213) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %6 = "byre.alias"(%alloc) {offset = 6400 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %7 = "byre.alias"(%alloc) {offset = 6144 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 6400 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 6144 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg213, %arg8, %arg7, %3, %6, %7) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg208, %arg214) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg15, %arg215) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown10", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg208, %arg214) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg15, %arg215) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute 
@ConvOp_f16f16_f16(%arg214, %arg215, %arg216) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %8 = "byre.alias"(%alloc) {offset = 5888 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %9 = "byre.alias"(%alloc) {offset = 5632 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 5888 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 5632 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg216, %arg12, %arg11, %3, %8, %9) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg217) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg16, %arg218) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg217) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg16, %arg218) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg217, %arg218, %arg219) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %10 = "byre.alias"(%alloc) {offset = 5376 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %11 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 5376 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg219, %arg14, %arg13, %3, %10, %11) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, 
"cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg214, %arg220) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown15", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg23, %arg226) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> + byre.compute @PTXOp(%3, %arg214, %arg220) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg23, %arg226) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg220, %arg226, %arg227) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %12 = "byre.alias"(%alloc) {offset = 8704 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - %13 = "byre.alias"(%alloc) {offset = 256 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %14 = "byre.alias"(%alloc) {offset = 768 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 8704 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 256 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg227, %arg25, %arg24, %12, %13, %14) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%arg21, %arg221) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg21, %arg221) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg220, %arg221, %arg222) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = 
[1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %15 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - %16 = "byre.alias"(%alloc) {offset = 4864 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %17 = "byre.alias"(%alloc) {offset = 1280 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 4864 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 1280 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg222, %arg18, %arg17, %15, %16, %17) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg223) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg22, %arg224) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %arg223) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg22, %arg224) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg223, %arg224, %arg225) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %18 = "byre.alias"(%alloc) {offset = 1792 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %19 = "byre.alias"(%alloc) {offset = 2304 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %18 = "byre.alias"(%alloc) <{offset = 1792 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 2304 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg225, %arg20, %arg19, %15, %18, %19) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : 
i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %12, %arg228) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg30, %arg229) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %12, %arg228) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg30, %arg229) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg228, %arg229, %arg230) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %20 = "byre.alias"(%alloc) {offset = 2816 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %21 = "byre.alias"(%alloc) {offset = 3328 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %20 = "byre.alias"(%alloc) <{offset = 2816 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 3328 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg230, %arg27, %arg26, %15, %20, %21) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg231) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg31, %arg232) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown27", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %arg231) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg31, %arg232) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", 
memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg231, %arg232, %arg233) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %22 = "byre.alias"(%alloc) {offset = 3840 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %23 = "byre.alias"(%alloc) {offset = 4352 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 3840 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 4352 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg233, %arg29, %arg28, %15, %22, %23) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg228, %arg234) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown29", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg38, %arg240) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%15, %arg228, %arg234) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg38, %arg240) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg234, %arg240, %arg241) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %24 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - %25 = "byre.alias"(%alloc) {offset = 223744 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %26 = "byre.alias"(%alloc) {offset = 1836544 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %24 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> 
memref<1x256x14x14xf16, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 223744 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 1836544 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg241, %arg40, %arg39, %24, %25, %26) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%arg36, %arg235) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown32", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg36, %arg235) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown32", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg234, %arg235, %arg236) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %27 = "byre.alias"(%alloc) {offset = 325120 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - %28 = "byre.alias"(%alloc) {offset = 1835520 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %29 = "byre.alias"(%alloc) {offset = 1834496 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 325120 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %28 = "byre.alias"(%alloc) <{offset = 1835520 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 1834496 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg236, %arg33, %arg32, %27, %28, %29) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%27, %arg237) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg37, %arg238) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%27, %arg237) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute 
@PTXOp(%arg37, %arg238) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg237, %arg238, %arg239) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %30 = "byre.alias"(%alloc) {offset = 1833472 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %31 = "byre.alias"(%alloc) {offset = 1837568 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 1833472 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 1837568 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg239, %arg35, %arg34, %27, %30, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%27, %24, %arg242) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg45, %arg243) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown38", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%27, %24, %arg242) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg45, %arg243) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg242, %arg243, %arg244) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %32 = "byre.alias"(%alloc) {offset = 1832448 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %33 = "byre.alias"(%alloc) {offset = 1831424 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 1832448 : i64}> : (memref<1838592xi8, 
"cuda">) -> memref<256xf32, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 1831424 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg244, %arg42, %arg41, %24, %32, %33) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%24, %arg245) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown40", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg46, %arg246) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown41", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%24, %arg245) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg46, %arg246) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg245, %arg246, %arg247) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %34 = "byre.alias"(%alloc) {offset = 1830400 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %35 = "byre.alias"(%alloc) {offset = 7680 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 1830400 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 7680 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg247, %arg44, %arg43, %24, %34, %35) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%24, %arg242, %arg248) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown43", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg53, %arg254) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> + byre.compute @PTXOp(%24, %arg242, %arg248) {BlockSize.x = 
256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg53, %arg254) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg248, %arg254, %arg255) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %36 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %37 = "byre.alias"(%alloc) {offset = 8704 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %38 = "byre.alias"(%alloc) {offset = 209408 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 8704 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 209408 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg255, %arg55, %arg54, %36, %37, %38) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%arg51, %arg249) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg51, %arg249) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg248, %arg249, %arg250) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %39 = "byre.alias"(%alloc) {offset = 274944 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %40 = "byre.alias"(%alloc) {offset = 12800 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %41 = "byre.alias"(%alloc) {offset = 10752 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 274944 : i64}> : (memref<1838592xi8, "cuda">) -> 
memref<1x512x7x7xf16, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 12800 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 10752 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg250, %arg48, %arg47, %39, %40, %41) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%39, %arg251) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg52, %arg252) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%39, %arg251) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg52, %arg252) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg251, %arg252, %arg253) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %42 = "byre.alias"(%alloc) {offset = 211456 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %43 = "byre.alias"(%alloc) {offset = 213504 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %42 = "byre.alias"(%alloc) <{offset = 211456 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 213504 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg253, %arg50, %arg49, %39, %42, %43) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%39, %36, %arg256) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg60, %arg257) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown52", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, 
memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%39, %36, %arg256) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg60, %arg257) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg256, %arg257, %arg258) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %44 = "byre.alias"(%alloc) {offset = 215552 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %45 = "byre.alias"(%alloc) {offset = 217600 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 215552 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 217600 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg258, %arg57, %arg56, %36, %44, %45) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%36, %arg259) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown54", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg61, %arg260) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%36, %arg259) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg61, %arg260) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg259, %arg260, %arg261) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %46 = "byre.alias"(%alloc) {offset = 219648 : i64} : (memref<1838592xi8, "cuda">) -> 
memref<512xf32, "cuda"> - %47 = "byre.alias"(%alloc) {offset = 221696 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 219648 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %47 = "byre.alias"(%alloc) <{offset = 221696 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg261, %arg59, %arg58, %36, %46, %47) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%36, %arg256, %arg262) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %48 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%arg262, %48) {device = "cuda", dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512xf16, "cuda"> - byre.compute @PTXOp(%48, %arg263) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown58", memory_effects = [1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512xf16, "cuda"> - %49 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%arg4, %49) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown59", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @TransposeOp_f16_f16(%49, %arg264) {device = "cuda", memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16, "cuda">, memref<512x1000xf16, "cuda"> - %50 = "byre.alias"(%alloc) {offset = 14848 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x1000xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%arg263, %49, %50) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<1x1000xf16, "cuda"> - byre.compute @PTXOp(%arg3, %50, %arg123) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown60", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">, memref<1x1000xf16, "cuda"> - byre.compute @PTXOp(%1, %arg63, %arg164) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%2, %arg64, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%4, %arg66, %arg166) {BlockSize.x = 128 : i32, 
GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown63", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%5, %arg67, %arg167) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%6, %arg69, %arg168) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown65", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%7, %arg70, %arg169) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown66", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%8, %arg72, %arg170) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown67", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%9, %arg73, %arg171) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown68", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%10, %arg75, %arg172) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%11, %arg76, %arg173) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown70", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%16, %arg78, %arg174) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown71", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%17, %arg79, %arg175) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%18, %arg81, %arg176) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%19, %arg82, %arg177) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%13, %arg84, %arg178) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown75", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, 
memref<128xf32, "cuda"> - byre.compute @PTXOp(%14, %arg85, %arg179) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown76", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%20, %arg87, %arg180) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown77", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%21, %arg88, %arg181) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown78", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%22, %arg90, %arg182) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown79", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%23, %arg91, %arg183) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%28, %arg93, %arg184) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown81", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%29, %arg94, %arg185) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%30, %arg96, %arg186) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown83", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%31, %arg97, %arg187) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown84", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%25, %arg99, %arg188) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%26, %arg100, %arg189) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown86", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%32, %arg102, %arg190) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown87", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%33, %arg103, %arg191) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], 
kernel_name = "Unknown88", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%34, %arg105, %arg192) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%35, %arg106, %arg193) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%40, %arg108, %arg194) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown91", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%41, %arg109, %arg195) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%42, %arg111, %arg196) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown93", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%43, %arg112, %arg197) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown94", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%37, %arg114, %arg198) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%38, %arg115, %arg199) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown96", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%44, %arg117, %arg200) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown97", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%45, %arg118, %arg201) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown98", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%46, %arg120, %arg202) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown99", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%47, %arg121, %arg203) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown100", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + 
byre.compute @PTXOp(%36, %arg256, %arg262) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %48 = "byre.alias"(%arg262) <{offset = 0 : i64}> : (memref<1x512x7x7xf16, "cuda">) -> memref<512x49xf16, "cuda"> + %49 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf16, "cuda"> + byre.compute @PTXOp(%48, %49) {BlockSize.x = 64 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 512 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown58_kernel"} : memref<512x49xf16, "cuda">, memref<512xf16, "cuda"> + %50 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x512xf16, "cuda"> + byre.compute @PTXOp(%50, %arg263) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown59", memory_effects = [1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512xf16, "cuda"> + %51 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%arg4, %51) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown60", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> + byre.compute @TransposeOp_f16_f16(%51, %arg264) {device = "cuda", memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16, "cuda">, memref<512x1000xf16, "cuda"> + %52 = "byre.alias"(%alloc) <{offset = 14848 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x1000xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%arg263, %51, %52) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<1x1000xf16, "cuda"> + byre.compute @PTXOp(%arg3, %52, %arg123) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">, memref<1x1000xf16, "cuda"> + byre.compute @PTXOp(%1, %arg63, %arg164) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%2, %arg64, %arg165) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%4, %arg66, %arg166) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%5, %arg67, %arg167) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, 
memref<64xf32, "cuda"> + byre.compute @PTXOp(%6, %arg69, %arg168) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%7, %arg70, %arg169) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%8, %arg72, %arg170) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%9, %arg73, %arg171) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%10, %arg75, %arg172) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%11, %arg76, %arg173) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%16, %arg78, %arg174) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%17, %arg79, %arg175) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%18, %arg81, %arg176) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%19, %arg82, %arg177) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%13, %arg84, %arg178) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%14, %arg85, %arg179) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%20, %arg87, %arg180) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", 
memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%21, %arg88, %arg181) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%22, %arg90, %arg182) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%23, %arg91, %arg183) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%28, %arg93, %arg184) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%29, %arg94, %arg185) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%30, %arg96, %arg186) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%31, %arg97, %arg187) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%25, %arg99, %arg188) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%26, %arg100, %arg189) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%32, %arg102, %arg190) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%33, %arg103, %arg191) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%34, %arg105, %arg192) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%35, %arg106, %arg193) 
{BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%40, %arg108, %arg194) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%41, %arg109, %arg195) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%42, %arg111, %arg196) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%43, %arg112, %arg197) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%37, %arg114, %arg198) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%38, %arg115, %arg199) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%44, %arg117, %arg200) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%45, %arg118, %arg201) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%46, %arg120, %arg202) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%47, %arg121, %arg203) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> byre.copy(%arg0, %arg124) {callee = "cuda2cuda"} : memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.copy(%arg1, %arg125) {callee = "cuda2cuda"} : memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.copy(%arg5, %arg126) {callee = "cuda2cuda"} : memref<64xf32, "cuda">, memref<64xf32, "cuda"> diff --git a/compiler/test/E2E/ResNet18/FW/9b_nvvm_codegen.mlir b/compiler/test/E2E/ResNet18/FW/9b_nvvm_codegen.mlir index 1ed14037d..5964c86c0 100644 --- 
a/compiler/test/E2E/ResNet18/FW/9b_nvvm_codegen.mlir
+++ b/compiler/test/E2E/ResNet18/FW/9b_nvvm_codegen.mlir
@@ -4,2682 +4,927 @@ module attributes {byre.container_module, gpu.container_module} {
   gpu.module @unified {
-    gpu.func @Unknown100(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown99(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown98(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown97(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown96(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown95(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown94(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown93(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown91(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown90(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c256 = arith.constant 256 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c256 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<256xf32>
-        %7 = memref.load %arg1[%4] : memref<256xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<256xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown89(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c256 = arith.constant 256 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c256 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<256xf32>
-        %7 = memref.load %arg1[%4] : memref<256xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<256xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown88(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c256 = arith.constant 256 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c256 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<256xf32>
-        %7 = memref.load %arg1[%4] : memref<256xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<256xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown87(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c256 = arith.constant 256 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c256 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<256xf32>
-        %7 = memref.load %arg1[%4] : memref<256xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<256xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown86(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c256 = arith.constant 256 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c256 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<256xf32>
-        %7 = memref.load %arg1[%4] : memref<256xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<256xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown85(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c256 = arith.constant 256 : index
-      %0 = 
gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown84(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown83(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown81(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown80(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 
: index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown79(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown78(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown77(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown76(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown75(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = 
arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown74(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown73(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown71(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown70(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown69(%arg0: 
memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown68(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown67(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown66(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown65(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown64(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = 
gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown63(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown61(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index + gpu.func @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c512 step %6 { + %7 = memref.load %arg1[%arg3] : memref<512xf32> + %8 = memref.load %arg0[%arg3] : memref<512xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<512xf32> } gpu.return } - gpu.func @Unknown60(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>, %arg2: memref<1x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c1000 = arith.constant 1000 : index + gpu.func @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = 
arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg1[%c0, %4] : memref<1x1000xf16> - %7 = memref.load %arg0[%4] : memref<1000xf32> - %8 = arith.truncf %7 : f32 to f16 - %9 = arith.addf %6, %8 : f16 - memref.store %9, %arg2[%c0, %4] : memref<1x1000xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c256 step %6 { + %7 = memref.load %arg1[%arg3] : memref<256xf32> + %8 = memref.load %arg0[%arg3] : memref<256xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<256xf32> } gpu.return } - gpu.func @Unknown59(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { - %c0 = arith.constant 0 : index - %c512000 = arith.constant 512000 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c128 step %6 { + %7 = memref.load %arg1[%arg3] : memref<128xf32> + %8 = memref.load %arg0[%arg3] : memref<128xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<128xf32> } gpu.return } - gpu.func @Unknown58(%arg0: memref<1x512xf16>, %arg1: memref<1x512xf16>) kernel { - %cst = arith.constant 2.040100e-02 : f16 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index + gpu.func @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%c0, %4] : memref<1x512xf16> - %7 = arith.mulf %6, %cst : f16 - memref.store %7, %arg1[%c0, %4] : memref<1x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c64 step %6 { + %7 = memref.load %arg1[%arg3] : memref<64xf32> + %8 = memref.load %arg0[%arg3] : memref<64xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<64xf32> } gpu.return } - gpu.func @Unknown57(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 + gpu.func @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>, %arg2: memref<1x1000xf16>) 
kernel { %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg3] : memref<1000xf32> + %8 = memref.load %arg1[%c0, %arg3] : memref<1x1000xf16> + %9 = arith.truncf %7 : f32 to f16 + %10 = arith.addf %8, %9 : f16 + memref.store %10, %arg2[%c0, %arg3] : memref<1x1000xf16> } gpu.return } - gpu.func @Unknown55(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown60(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { + %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = 
memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown54(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf16> } gpu.return } - gpu.func @Unknown52(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { + gpu.func @Unknown59(%arg0: memref<1x512xf16>, %arg1: memref<1x512xf16>) kernel { + %cst = arith.constant 2.040100e-02 : f16 %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index 
- %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512 step %6 { + %7 = memref.load %arg0[%c0, %arg2] : memref<1x512xf16> + %8 = arith.mulf %7, %cst : f16 + memref.store %8, %arg1[%c0, %arg2] : memref<1x512xf16> } gpu.return } gpu.func @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown49(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = 
arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf16> } gpu.return } gpu.func @Unknown48(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown46(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c1179648 = 
arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf16> } gpu.return } gpu.func @Unknown44(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - } - gpu.return - } - gpu.func @Unknown43(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : 
index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown41(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown40(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 
= arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown38(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> } gpu.return } gpu.func @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant 
-1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown35(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = 
arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf16> } gpu.return } gpu.func @Unknown34(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg2, %c14 : index + %8 = arith.divsi %arg2, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown32(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c294912 = arith.constant 294912 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = 
arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf16> } gpu.return } gpu.func @Unknown30(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - } - gpu.return - } - gpu.func @Unknown29(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index 
- %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown27(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown26(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : 
memref<1x128x28x28xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown24(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi 
%22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown21(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf16> } gpu.return } gpu.func @Unknown20(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = 
arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg2, %c28 : index + %8 = arith.divsi %arg2, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown18(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = 
%4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf16> } gpu.return } gpu.func @Unknown16(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - } - gpu.return - } - gpu.func @Unknown15(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown13(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, 
%c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown10(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - 
%15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> } gpu.return } gpu.func @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown7(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index 
- %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown6(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg2, %c56 : index + %8 = arith.divsi %arg2, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : 
memref<1x64x56x56xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf16> } gpu.return } gpu.func @Unknown3(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi 
slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg2, %c112 : index + %8 = arith.divsi %arg2, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x64x112x112xf16> } gpu.return } gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { - %c0 = arith.constant 0 : index %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf16> } gpu.return } gpu.func @Unknown0(%arg0: memref<1x3x224x224xf32>, %arg1: memref<1x3x224x224xf16>) kernel { - %c0 = arith.constant 0 : index %c150528 = arith.constant 150528 : index + %c0 = arith.constant 0 : index %c224 = arith.constant 224 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 
: index - %5 = arith.cmpi slt, %4, %c150528 : index - scf.if %5 { - %6 = arith.remsi %4, %c224 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c224 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c224 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c224 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c224 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c224 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x3x224x224xf32> - %27 = arith.truncf %26 : f32 to f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x3x224x224xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c150528 step %6 { + %7 = arith.remsi %arg2, %c224 : index + %8 = arith.divsi %arg2, %c224 : index + %9 = arith.remsi %8, %c224 : index + %10 = arith.divsi %8, %c224 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x3x224x224xf32> + %12 = arith.truncf %11 : f32 to f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x3x224x224xf16> } gpu.return } + gpu.func @Unknown58_kernel(%arg0: memref<512x49xf16>, %arg1: memref<512xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c49 = arith.constant 49 : index + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 49] [1, 1] : memref<512x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c64 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c64 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c49 : index + %7 = arith.select %6, %5, %c49 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c49 : index + %10 = arith.select %9, %8, %c49 : index + %11 = arith.subi %10, %7 : index + %subview_0 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13 = scf.if %12 -> (f16) { + %21 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %14 = arith.addf %13, %cst : f16 + memref.store %14, %alloca[%1] : memref<64xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<32xf16, #gpu.address_space> + %15 = arith.cmpi ult, %1, %c32 : index + scf.if %15 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca[%21] : memref<64xf16, 
#gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca[%24] : memref<64xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_2[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<16xf16, #gpu.address_space> + %16 = arith.cmpi ult, %1, %c16 : index + scf.if %16 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_2[%21] : memref<32xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_2[%24] : memref<32xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_3[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<8xf16, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c8 : index + scf.if %17 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_3[%21] : memref<16xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_3[%24] : memref<16xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_4[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<4xf16, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c4 : index + scf.if %18 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_4[%21] : memref<8xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_4[%24] : memref<8xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_5[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<2xf16, #gpu.address_space> + %19 = arith.cmpi ult, %1, %c2 : index + scf.if %19 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_5[%21] : memref<4xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_5[%24] : memref<4xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_6[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %20 = arith.cmpi ult, %1, %c1 : index + scf.if %20 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_6[%21] : memref<2xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_6[%24] : memref<2xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %arg1[%0] : memref<512xf16> + } + gpu.barrier + gpu.return + } } func.func @main(%arg0: memref<64xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<64xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<1000xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<1000x512xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<64x64x3x3xf32, 
"cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<64xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<64xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<64xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<64xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input15", byre.argtype = 1 : i32}, %arg16: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<128xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<128xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<128xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<128xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<128xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<128xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<128xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<128xf32, "cuda"> {byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<128xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<128xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, %arg31: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<256xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<256xf32, "cuda"> {byre.argname = "Input33", byre.argtype = 1 : i32}, %arg34: memref<256xf32, "cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<256xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input37", byre.argtype = 1 : i32}, %arg38: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<256xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<256xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<256xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<256xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<256xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<256xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<512xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<512xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<512xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<512xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<512x256x3x3xf32, "cuda"> 
{byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<512xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<512xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<512xf32, "cuda"> {byre.argname = "Input56", byre.argtype = 1 : i32}, %arg57: memref<512xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<512xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<512xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<64xf32, "cuda"> {byre.argname = "Input63", byre.argtype = 1 : i32}, %arg64: memref<64xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<64xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<64xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<64xf32, "cuda"> {byre.argname = "Input69", byre.argtype = 1 : i32}, %arg70: memref<64xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<64xf32, "cuda"> {byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<64xf32, "cuda"> {byre.argname = "Input73", byre.argtype = 1 : i32}, %arg74: memref {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<64xf32, "cuda"> {byre.argname = "Input75", byre.argtype = 1 : i32}, %arg76: memref<64xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<128xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: memref<128xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<128xf32, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<128xf32, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<128xf32, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<128xf32, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<128xf32, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<128xf32, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<128xf32, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<128xf32, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<256xf32, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<256xf32, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref {byre.argname = "Input95", byre.argtype 
= 1 : i32}, %arg96: memref<256xf32, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<256xf32, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<256xf32, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<256xf32, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref {byre.argname = "Input101", byre.argtype = 1 : i32}, %arg102: memref<256xf32, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<256xf32, "cuda"> {byre.argname = "Input103", byre.argtype = 1 : i32}, %arg104: memref {byre.argname = "Input104", byre.argtype = 1 : i32}, %arg105: memref<256xf32, "cuda"> {byre.argname = "Input105", byre.argtype = 1 : i32}, %arg106: memref<256xf32, "cuda"> {byre.argname = "Input106", byre.argtype = 1 : i32}, %arg107: memref {byre.argname = "Input107", byre.argtype = 1 : i32}, %arg108: memref<512xf32, "cuda"> {byre.argname = "Input108", byre.argtype = 1 : i32}, %arg109: memref<512xf32, "cuda"> {byre.argname = "Input109", byre.argtype = 1 : i32}, %arg110: memref {byre.argname = "Input110", byre.argtype = 1 : i32}, %arg111: memref<512xf32, "cuda"> {byre.argname = "Input111", byre.argtype = 1 : i32}, %arg112: memref<512xf32, "cuda"> {byre.argname = "Input112", byre.argtype = 1 : i32}, %arg113: memref {byre.argname = "Input113", byre.argtype = 1 : i32}, %arg114: memref<512xf32, "cuda"> {byre.argname = "Input114", byre.argtype = 1 : i32}, %arg115: memref<512xf32, "cuda"> {byre.argname = "Input115", byre.argtype = 1 : i32}, %arg116: memref {byre.argname = "Input116", byre.argtype = 1 : i32}, %arg117: memref<512xf32, "cuda"> {byre.argname = "Input117", byre.argtype = 1 : i32}, %arg118: memref<512xf32, "cuda"> {byre.argname = "Input118", byre.argtype = 1 : i32}, %arg119: memref {byre.argname = "Input119", byre.argtype = 1 : i32}, %arg120: memref<512xf32, "cuda"> {byre.argname = "Input120", byre.argtype = 1 : i32}, %arg121: memref<512xf32, "cuda"> {byre.argname = "Input121", byre.argtype = 1 : i32}, %arg122: memref<1x3x224x224xf32, "cuda"> {byre.argname = "Input122", byre.argtype = 1 : i32}, %arg123: memref<1x1000xf16, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg124: memref<64xf32, "cuda"> {byre.arg_alias_index = 0 : i64, byre.argname = "Output1", byre.argtype = 2 : i32}, %arg125: memref<64xf32, "cuda"> {byre.arg_alias_index = 1 : i64, byre.argname = "Output2", byre.argtype = 2 : i32}, %arg126: memref<64xf32, "cuda"> {byre.arg_alias_index = 5 : i64, byre.argname = "Output3", byre.argtype = 2 : i32}, %arg127: memref<64xf32, "cuda"> {byre.arg_alias_index = 6 : i64, byre.argname = "Output4", byre.argtype = 2 : i32}, %arg128: memref<64xf32, "cuda"> {byre.arg_alias_index = 7 : i64, byre.argname = "Output5", byre.argtype = 2 : i32}, %arg129: memref<64xf32, "cuda"> {byre.arg_alias_index = 8 : i64, byre.argname = "Output6", byre.argtype = 2 : i32}, %arg130: memref<64xf32, "cuda"> {byre.arg_alias_index = 11 : i64, byre.argname = "Output7", byre.argtype = 2 : i32}, %arg131: memref<64xf32, "cuda"> {byre.arg_alias_index = 12 : i64, byre.argname = "Output8", byre.argtype = 2 : i32}, %arg132: memref<64xf32, "cuda"> {byre.arg_alias_index = 13 : i64, byre.argname = "Output9", byre.argtype = 2 : i32}, %arg133: memref<64xf32, "cuda"> {byre.arg_alias_index = 14 : i64, byre.argname = "Output10", byre.argtype = 2 : i32}, %arg134: memref<128xf32, "cuda"> {byre.arg_alias_index = 17 : i64, byre.argname = "Output11", 
byre.argtype = 2 : i32}, %arg135: memref<128xf32, "cuda"> {byre.arg_alias_index = 18 : i64, byre.argname = "Output12", byre.argtype = 2 : i32}, %arg136: memref<128xf32, "cuda"> {byre.arg_alias_index = 19 : i64, byre.argname = "Output13", byre.argtype = 2 : i32}, %arg137: memref<128xf32, "cuda"> {byre.arg_alias_index = 20 : i64, byre.argname = "Output14", byre.argtype = 2 : i32}, %arg138: memref<128xf32, "cuda"> {byre.arg_alias_index = 24 : i64, byre.argname = "Output15", byre.argtype = 2 : i32}, %arg139: memref<128xf32, "cuda"> {byre.arg_alias_index = 25 : i64, byre.argname = "Output16", byre.argtype = 2 : i32}, %arg140: memref<128xf32, "cuda"> {byre.arg_alias_index = 26 : i64, byre.argname = "Output17", byre.argtype = 2 : i32}, %arg141: memref<128xf32, "cuda"> {byre.arg_alias_index = 27 : i64, byre.argname = "Output18", byre.argtype = 2 : i32}, %arg142: memref<128xf32, "cuda"> {byre.arg_alias_index = 28 : i64, byre.argname = "Output19", byre.argtype = 2 : i32}, %arg143: memref<128xf32, "cuda"> {byre.arg_alias_index = 29 : i64, byre.argname = "Output20", byre.argtype = 2 : i32}, %arg144: memref<256xf32, "cuda"> {byre.arg_alias_index = 32 : i64, byre.argname = "Output21", byre.argtype = 2 : i32}, %arg145: memref<256xf32, "cuda"> {byre.arg_alias_index = 33 : i64, byre.argname = "Output22", byre.argtype = 2 : i32}, %arg146: memref<256xf32, "cuda"> {byre.arg_alias_index = 34 : i64, byre.argname = "Output23", byre.argtype = 2 : i32}, %arg147: memref<256xf32, "cuda"> {byre.arg_alias_index = 35 : i64, byre.argname = "Output24", byre.argtype = 2 : i32}, %arg148: memref<256xf32, "cuda"> {byre.arg_alias_index = 39 : i64, byre.argname = "Output25", byre.argtype = 2 : i32}, %arg149: memref<256xf32, "cuda"> {byre.arg_alias_index = 40 : i64, byre.argname = "Output26", byre.argtype = 2 : i32}, %arg150: memref<256xf32, "cuda"> {byre.arg_alias_index = 41 : i64, byre.argname = "Output27", byre.argtype = 2 : i32}, %arg151: memref<256xf32, "cuda"> {byre.arg_alias_index = 42 : i64, byre.argname = "Output28", byre.argtype = 2 : i32}, %arg152: memref<256xf32, "cuda"> {byre.arg_alias_index = 43 : i64, byre.argname = "Output29", byre.argtype = 2 : i32}, %arg153: memref<256xf32, "cuda"> {byre.arg_alias_index = 44 : i64, byre.argname = "Output30", byre.argtype = 2 : i32}, %arg154: memref<512xf32, "cuda"> {byre.arg_alias_index = 47 : i64, byre.argname = "Output31", byre.argtype = 2 : i32}, %arg155: memref<512xf32, "cuda"> {byre.arg_alias_index = 48 : i64, byre.argname = "Output32", byre.argtype = 2 : i32}, %arg156: memref<512xf32, "cuda"> {byre.arg_alias_index = 49 : i64, byre.argname = "Output33", byre.argtype = 2 : i32}, %arg157: memref<512xf32, "cuda"> {byre.arg_alias_index = 50 : i64, byre.argname = "Output34", byre.argtype = 2 : i32}, %arg158: memref<512xf32, "cuda"> {byre.arg_alias_index = 54 : i64, byre.argname = "Output35", byre.argtype = 2 : i32}, %arg159: memref<512xf32, "cuda"> {byre.arg_alias_index = 55 : i64, byre.argname = "Output36", byre.argtype = 2 : i32}, %arg160: memref<512xf32, "cuda"> {byre.arg_alias_index = 56 : i64, byre.argname = "Output37", byre.argtype = 2 : i32}, %arg161: memref<512xf32, "cuda"> {byre.arg_alias_index = 57 : i64, byre.argname = "Output38", byre.argtype = 2 : i32}, %arg162: memref<512xf32, "cuda"> {byre.arg_alias_index = 58 : i64, byre.argname = "Output39", byre.argtype = 2 : i32}, %arg163: memref<512xf32, "cuda"> {byre.arg_alias_index = 59 : i64, byre.argname = "Output40", byre.argtype = 2 : i32}, %arg164: memref<64xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : 
i32}, %arg165: memref<64xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg166: memref<64xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg167: memref<64xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg168: memref<64xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, %arg169: memref<64xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg170: memref<64xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg171: memref<64xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg172: memref<64xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : i32}, %arg173: memref<64xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg174: memref<128xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg175: memref<128xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg176: memref<128xf32, "cuda"> {byre.argname = "Output53", byre.argtype = 2 : i32}, %arg177: memref<128xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg178: memref<128xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg179: memref<128xf32, "cuda"> {byre.argname = "Output56", byre.argtype = 2 : i32}, %arg180: memref<128xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg181: memref<128xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg182: memref<128xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg183: memref<128xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg184: memref<256xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}, %arg185: memref<256xf32, "cuda"> {byre.argname = "Output62", byre.argtype = 2 : i32}, %arg186: memref<256xf32, "cuda"> {byre.argname = "Output63", byre.argtype = 2 : i32}, %arg187: memref<256xf32, "cuda"> {byre.argname = "Output64", byre.argtype = 2 : i32}, %arg188: memref<256xf32, "cuda"> {byre.argname = "Output65", byre.argtype = 2 : i32}, %arg189: memref<256xf32, "cuda"> {byre.argname = "Output66", byre.argtype = 2 : i32}, %arg190: memref<256xf32, "cuda"> {byre.argname = "Output67", byre.argtype = 2 : i32}, %arg191: memref<256xf32, "cuda"> {byre.argname = "Output68", byre.argtype = 2 : i32}, %arg192: memref<256xf32, "cuda"> {byre.argname = "Output69", byre.argtype = 2 : i32}, %arg193: memref<256xf32, "cuda"> {byre.argname = "Output70", byre.argtype = 2 : i32}, %arg194: memref<512xf32, "cuda"> {byre.argname = "Output71", byre.argtype = 2 : i32}, %arg195: memref<512xf32, "cuda"> {byre.argname = "Output72", byre.argtype = 2 : i32}, %arg196: memref<512xf32, "cuda"> {byre.argname = "Output73", byre.argtype = 2 : i32}, %arg197: memref<512xf32, "cuda"> {byre.argname = "Output74", byre.argtype = 2 : i32}, %arg198: memref<512xf32, "cuda"> {byre.argname = "Output75", byre.argtype = 2 : i32}, %arg199: memref<512xf32, "cuda"> {byre.argname = "Output76", byre.argtype = 2 : i32}, %arg200: memref<512xf32, "cuda"> {byre.argname = "Output77", byre.argtype = 2 : i32}, %arg201: memref<512xf32, "cuda"> {byre.argname = "Output78", byre.argtype = 2 : i32}, %arg202: memref<512xf32, "cuda"> {byre.argname = "Output79", byre.argtype = 2 : i32}, %arg203: memref<512xf32, "cuda"> {byre.argname = "Output80", byre.argtype = 2 : i32}, %arg204: memref<64x3x7x7xf16, "cuda"> {byre.argname = "Output81", byre.argtype = 2 : i32}, %arg205: memref<1x3x224x224xf16, "cuda"> {byre.argname = "Output82", byre.argtype = 2 : i32}, %arg206: 
memref<1x64x112x112xf16, "cuda"> {byre.argname = "Output83", byre.argtype = 2 : i32}, %arg207: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Output84", byre.argtype = 2 : i32}, %arg208: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output85", byre.argtype = 2 : i32}, %arg209: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output86", byre.argtype = 2 : i32}, %arg210: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output87", byre.argtype = 2 : i32}, %arg211: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output88", byre.argtype = 2 : i32}, %arg212: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output89", byre.argtype = 2 : i32}, %arg213: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output90", byre.argtype = 2 : i32}, %arg214: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output91", byre.argtype = 2 : i32}, %arg215: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output92", byre.argtype = 2 : i32}, %arg216: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output93", byre.argtype = 2 : i32}, %arg217: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output94", byre.argtype = 2 : i32}, %arg218: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output95", byre.argtype = 2 : i32}, %arg219: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output96", byre.argtype = 2 : i32}, %arg220: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output97", byre.argtype = 2 : i32}, %arg221: memref<128x64x3x3xf16, "cuda"> {byre.argname = "Output98", byre.argtype = 2 : i32}, %arg222: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output99", byre.argtype = 2 : i32}, %arg223: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output100", byre.argtype = 2 : i32}, %arg224: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output101", byre.argtype = 2 : i32}, %arg225: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output102", byre.argtype = 2 : i32}, %arg226: memref<128x64x1x1xf16, "cuda"> {byre.argname = "Output103", byre.argtype = 2 : i32}, %arg227: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output104", byre.argtype = 2 : i32}, %arg228: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output105", byre.argtype = 2 : i32}, %arg229: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output106", byre.argtype = 2 : i32}, %arg230: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output107", byre.argtype = 2 : i32}, %arg231: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output108", byre.argtype = 2 : i32}, %arg232: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output109", byre.argtype = 2 : i32}, %arg233: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output110", byre.argtype = 2 : i32}, %arg234: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output111", byre.argtype = 2 : i32}, %arg235: memref<256x128x3x3xf16, "cuda"> {byre.argname = "Output112", byre.argtype = 2 : i32}, %arg236: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output113", byre.argtype = 2 : i32}, %arg237: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output114", byre.argtype = 2 : i32}, %arg238: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output115", byre.argtype = 2 : i32}, %arg239: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output116", byre.argtype = 2 : i32}, %arg240: memref<256x128x1x1xf16, "cuda"> {byre.argname = "Output117", byre.argtype = 2 : i32}, %arg241: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output118", byre.argtype = 2 : i32}, %arg242: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output119", byre.argtype = 2 : i32}, %arg243: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output120", byre.argtype 
= 2 : i32}, %arg244: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output121", byre.argtype = 2 : i32}, %arg245: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output122", byre.argtype = 2 : i32}, %arg246: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output123", byre.argtype = 2 : i32}, %arg247: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output124", byre.argtype = 2 : i32}, %arg248: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output125", byre.argtype = 2 : i32}, %arg249: memref<512x256x3x3xf16, "cuda"> {byre.argname = "Output126", byre.argtype = 2 : i32}, %arg250: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output127", byre.argtype = 2 : i32}, %arg251: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output128", byre.argtype = 2 : i32}, %arg252: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output129", byre.argtype = 2 : i32}, %arg253: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output130", byre.argtype = 2 : i32}, %arg254: memref<512x256x1x1xf16, "cuda"> {byre.argname = "Output131", byre.argtype = 2 : i32}, %arg255: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output132", byre.argtype = 2 : i32}, %arg256: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output133", byre.argtype = 2 : i32}, %arg257: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output134", byre.argtype = 2 : i32}, %arg258: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output135", byre.argtype = 2 : i32}, %arg259: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output136", byre.argtype = 2 : i32}, %arg260: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output137", byre.argtype = 2 : i32}, %arg261: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output138", byre.argtype = 2 : i32}, %arg262: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output139", byre.argtype = 2 : i32}, %arg263: memref<1x512xf16, "cuda"> {byre.argname = "Output140", byre.argtype = 2 : i32}, %arg264: memref<512x1000xf16, "cuda"> {byre.argname = "Output141", byre.argtype = 2 : i32}) attributes {byre.entry_point} { %alloc = memref.alloc() : memref<1838592xi8, "cuda"> - byre.compute @PTXOp(%arg122, %arg205) {BlockSize.x = 128 : i32, GridSize.x = 1176 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<1x3x224x224xf32, "cuda">, memref<1x3x224x224xf16, "cuda"> - byre.compute @PTXOp(%arg2, %arg204) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg122, %arg205) {BlockSize.x = 256 : i32, GridSize.x = 147 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<1x3x224x224xf32, "cuda">, memref<1x3x224x224xf16, "cuda"> + byre.compute @PTXOp(%arg2, %arg204) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg205, %arg204, %arg206) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16, "cuda">, memref<64x3x7x7xf16, "cuda">, 
memref<1x64x112x112xf16, "cuda"> - %0 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> - %1 = "byre.alias"(%alloc) {offset = 7424 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %2 = "byre.alias"(%alloc) {offset = 7168 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 7424 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 7168 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg206, %arg1, %arg0, %0, %1, %2) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%0, %arg207) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%0, %arg207) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> byre.compute @PoolMaxOp_f16_f16(%arg207, %arg208) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<1x64x112x112xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg9, %arg209) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg9, %arg209) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg208, %arg209, %arg210) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %3 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - %4 = "byre.alias"(%alloc) {offset = 6912 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %5 = "byre.alias"(%alloc) {offset = 6656 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %3 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 6912 : i64}> : 
(memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 6656 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg210, %arg6, %arg5, %3, %4, %5) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg211) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg10, %arg212) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg211) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg10, %arg212) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg211, %arg212, %arg213) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %6 = "byre.alias"(%alloc) {offset = 6400 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %7 = "byre.alias"(%alloc) {offset = 6144 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 6400 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 6144 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg213, %arg8, %arg7, %3, %6, %7) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg208, %arg214) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg15, %arg215) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown10", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg208, %arg214) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : 
i32, 4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg15, %arg215) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg214, %arg215, %arg216) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %8 = "byre.alias"(%alloc) {offset = 5888 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %9 = "byre.alias"(%alloc) {offset = 5632 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 5888 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 5632 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg216, %arg12, %arg11, %3, %8, %9) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg217) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg16, %arg218) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg217) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg16, %arg218) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg217, %arg218, %arg219) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %10 = "byre.alias"(%alloc) {offset = 5376 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %11 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 5376 : i64}> : 
(memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg219, %arg14, %arg13, %3, %10, %11) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg214, %arg220) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown15", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg23, %arg226) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> + byre.compute @PTXOp(%3, %arg214, %arg220) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg23, %arg226) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg220, %arg226, %arg227) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %12 = "byre.alias"(%alloc) {offset = 8704 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - %13 = "byre.alias"(%alloc) {offset = 256 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %14 = "byre.alias"(%alloc) {offset = 768 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 8704 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 256 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg227, %arg25, %arg24, %12, %13, %14) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%arg21, %arg221) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg21, %arg221) 
{BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg220, %arg221, %arg222) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %15 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - %16 = "byre.alias"(%alloc) {offset = 4864 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %17 = "byre.alias"(%alloc) {offset = 1280 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 4864 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 1280 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg222, %arg18, %arg17, %15, %16, %17) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg223) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg22, %arg224) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %arg223) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg22, %arg224) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg223, %arg224, %arg225) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %18 = "byre.alias"(%alloc) {offset = 1792 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %19 = "byre.alias"(%alloc) {offset = 2304 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + 
%18 = "byre.alias"(%alloc) <{offset = 1792 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 2304 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg225, %arg20, %arg19, %15, %18, %19) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %12, %arg228) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg30, %arg229) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %12, %arg228) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg30, %arg229) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg228, %arg229, %arg230) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %20 = "byre.alias"(%alloc) {offset = 2816 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %21 = "byre.alias"(%alloc) {offset = 3328 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %20 = "byre.alias"(%alloc) <{offset = 2816 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 3328 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg230, %arg27, %arg26, %15, %20, %21) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg231) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg31, %arg232) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown27", memory_effects = [1 : i32, 2 : i32]} : 
memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %arg231) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg31, %arg232) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg231, %arg232, %arg233) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %22 = "byre.alias"(%alloc) {offset = 3840 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %23 = "byre.alias"(%alloc) {offset = 4352 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 3840 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 4352 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg233, %arg29, %arg28, %15, %22, %23) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg228, %arg234) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown29", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg38, %arg240) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%15, %arg228, %arg234) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg38, %arg240) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg234, %arg240, %arg241) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, 
memref<1x256x14x14xf16, "cuda"> - %24 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - %25 = "byre.alias"(%alloc) {offset = 223744 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %26 = "byre.alias"(%alloc) {offset = 1836544 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %24 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 223744 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 1836544 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg241, %arg40, %arg39, %24, %25, %26) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%arg36, %arg235) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown32", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg36, %arg235) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown32", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg234, %arg235, %arg236) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %27 = "byre.alias"(%alloc) {offset = 325120 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - %28 = "byre.alias"(%alloc) {offset = 1835520 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %29 = "byre.alias"(%alloc) {offset = 1834496 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 325120 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %28 = "byre.alias"(%alloc) <{offset = 1835520 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 1834496 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg236, %arg33, %arg32, %27, %28, %29) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%27, %arg237) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg37, %arg238) {BlockSize.x = 
128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%27, %arg237) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg37, %arg238) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg237, %arg238, %arg239) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %30 = "byre.alias"(%alloc) {offset = 1833472 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %31 = "byre.alias"(%alloc) {offset = 1837568 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 1833472 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 1837568 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg239, %arg35, %arg34, %27, %30, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%27, %24, %arg242) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg45, %arg243) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown38", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%27, %24, %arg242) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg45, %arg243) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg242, %arg243, %arg244) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : 
tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %32 = "byre.alias"(%alloc) {offset = 1832448 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %33 = "byre.alias"(%alloc) {offset = 1831424 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 1832448 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 1831424 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg244, %arg42, %arg41, %24, %32, %33) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%24, %arg245) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown40", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg46, %arg246) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown41", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%24, %arg245) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg46, %arg246) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg245, %arg246, %arg247) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %34 = "byre.alias"(%alloc) {offset = 1830400 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %35 = "byre.alias"(%alloc) {offset = 7680 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 1830400 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 7680 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg247, %arg44, %arg43, %24, %34, %35) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%24, %arg242, %arg248) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown43", 
memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg53, %arg254) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> + byre.compute @PTXOp(%24, %arg242, %arg248) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg53, %arg254) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg248, %arg254, %arg255) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %36 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %37 = "byre.alias"(%alloc) {offset = 8704 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %38 = "byre.alias"(%alloc) {offset = 209408 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 8704 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 209408 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg255, %arg55, %arg54, %36, %37, %38) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%arg51, %arg249) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg51, %arg249) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg248, %arg249, %arg250) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, 
memref<1x512x7x7xf16, "cuda"> - %39 = "byre.alias"(%alloc) {offset = 274944 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %40 = "byre.alias"(%alloc) {offset = 12800 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %41 = "byre.alias"(%alloc) {offset = 10752 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 274944 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 12800 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 10752 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg250, %arg48, %arg47, %39, %40, %41) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%39, %arg251) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg52, %arg252) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%39, %arg251) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg52, %arg252) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg251, %arg252, %arg253) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %42 = "byre.alias"(%alloc) {offset = 211456 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %43 = "byre.alias"(%alloc) {offset = 213504 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %42 = "byre.alias"(%alloc) <{offset = 211456 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 213504 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg253, %arg50, %arg49, %39, %42, %43) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%39, %36, %arg256) {BlockSize.x = 128 : i32, GridSize.x = 
196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg60, %arg257) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown52", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%39, %36, %arg256) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg60, %arg257) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg256, %arg257, %arg258) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %44 = "byre.alias"(%alloc) {offset = 215552 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %45 = "byre.alias"(%alloc) {offset = 217600 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 215552 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 217600 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg258, %arg57, %arg56, %36, %44, %45) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%36, %arg259) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown54", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg61, %arg260) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%36, %arg259) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg61, %arg260) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg259, %arg260, %arg261) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %46 = "byre.alias"(%alloc) {offset = 219648 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %47 = "byre.alias"(%alloc) {offset = 221696 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 219648 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %47 = "byre.alias"(%alloc) <{offset = 221696 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg261, %arg59, %arg58, %36, %46, %47) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%36, %arg256, %arg262) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %48 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%arg262, %48) {device = "cuda", dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512xf16, "cuda"> - byre.compute @PTXOp(%48, %arg263) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown58", memory_effects = [1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512xf16, "cuda"> - %49 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%arg4, %49) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown59", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @TransposeOp_f16_f16(%49, %arg264) {device = "cuda", memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16, "cuda">, memref<512x1000xf16, "cuda"> - %50 = "byre.alias"(%alloc) {offset = 14848 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x1000xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%arg263, %49, %50) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<1x1000xf16, "cuda"> - byre.compute @PTXOp(%arg3, %50, %arg123) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown60", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">, memref<1x1000xf16, "cuda"> - byre.compute @PTXOp(%1, %arg63, %arg164) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : 
i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%2, %arg64, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%4, %arg66, %arg166) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown63", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%5, %arg67, %arg167) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%6, %arg69, %arg168) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown65", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%7, %arg70, %arg169) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown66", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%8, %arg72, %arg170) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown67", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%9, %arg73, %arg171) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown68", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%10, %arg75, %arg172) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%11, %arg76, %arg173) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown70", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%16, %arg78, %arg174) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown71", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%17, %arg79, %arg175) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%18, %arg81, %arg176) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%19, %arg82, %arg177) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 
: i32, 1 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%13, %arg84, %arg178) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown75", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%14, %arg85, %arg179) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown76", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%20, %arg87, %arg180) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown77", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%21, %arg88, %arg181) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown78", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%22, %arg90, %arg182) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown79", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%23, %arg91, %arg183) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%28, %arg93, %arg184) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown81", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%29, %arg94, %arg185) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%30, %arg96, %arg186) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown83", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%31, %arg97, %arg187) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown84", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%25, %arg99, %arg188) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%26, %arg100, %arg189) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown86", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - 
byre.compute @PTXOp(%32, %arg102, %arg190) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown87", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%33, %arg103, %arg191) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown88", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%34, %arg105, %arg192) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%35, %arg106, %arg193) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%40, %arg108, %arg194) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown91", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%41, %arg109, %arg195) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%42, %arg111, %arg196) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown93", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%43, %arg112, %arg197) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown94", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%37, %arg114, %arg198) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%38, %arg115, %arg199) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown96", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%44, %arg117, %arg200) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown97", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%45, %arg118, %arg201) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown98", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%46, %arg120, %arg202) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown99", 
memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%47, %arg121, %arg203) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown100", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%36, %arg256, %arg262) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %48 = "byre.alias"(%arg262) <{offset = 0 : i64}> : (memref<1x512x7x7xf16, "cuda">) -> memref<512x49xf16, "cuda"> + %49 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf16, "cuda"> + byre.compute @PTXOp(%48, %49) {BlockSize.x = 64 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 512 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown58_kernel"} : memref<512x49xf16, "cuda">, memref<512xf16, "cuda"> + %50 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x512xf16, "cuda"> + byre.compute @PTXOp(%50, %arg263) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown59", memory_effects = [1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512xf16, "cuda"> + %51 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%arg4, %51) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown60", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> + byre.compute @TransposeOp_f16_f16(%51, %arg264) {device = "cuda", memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16, "cuda">, memref<512x1000xf16, "cuda"> + %52 = "byre.alias"(%alloc) <{offset = 14848 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x1000xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%arg263, %51, %52) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<1x1000xf16, "cuda"> + byre.compute @PTXOp(%arg3, %52, %arg123) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">, memref<1x1000xf16, "cuda"> + byre.compute @PTXOp(%1, %arg63, %arg164) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%2, %arg64, %arg165) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%4, %arg66, %arg166) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], 
kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%5, %arg67, %arg167) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%6, %arg69, %arg168) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%7, %arg70, %arg169) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%8, %arg72, %arg170) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%9, %arg73, %arg171) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%10, %arg75, %arg172) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%11, %arg76, %arg173) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%16, %arg78, %arg174) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%17, %arg79, %arg175) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%18, %arg81, %arg176) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%19, %arg82, %arg177) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%13, %arg84, %arg178) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%14, %arg85, %arg179) 
{BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%20, %arg87, %arg180) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%21, %arg88, %arg181) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%22, %arg90, %arg182) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%23, %arg91, %arg183) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%28, %arg93, %arg184) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%29, %arg94, %arg185) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%30, %arg96, %arg186) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%31, %arg97, %arg187) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%25, %arg99, %arg188) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%26, %arg100, %arg189) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%32, %arg102, %arg190) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%33, %arg103, %arg191) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : 
memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%34, %arg105, %arg192) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%35, %arg106, %arg193) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%40, %arg108, %arg194) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%41, %arg109, %arg195) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%42, %arg111, %arg196) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%43, %arg112, %arg197) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%37, %arg114, %arg198) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%38, %arg115, %arg199) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%44, %arg117, %arg200) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%45, %arg118, %arg201) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%46, %arg120, %arg202) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%47, %arg121, %arg203) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> byre.copy(%arg0, %arg124) {callee = "cuda2cuda"} : memref<64xf32, "cuda">, 
memref<64xf32, "cuda"> byre.copy(%arg1, %arg125) {callee = "cuda2cuda"} : memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.copy(%arg5, %arg126) {callee = "cuda2cuda"} : memref<64xf32, "cuda">, memref<64xf32, "cuda"> diff --git a/compiler/test/E2E/ResNet18/FW/device_output.ptx b/compiler/test/E2E/ResNet18/FW/device_output.ptx index 8b9cc4b4e..ec3082837 100644 --- a/compiler/test/E2E/ResNet18/FW/device_output.ptx +++ b/compiler/test/E2E/ResNet18/FW/device_output.ptx @@ -6,424 +6,14 @@ .target sm_70 .address_size 64 - // .globl Unknown100 - -.visible .entry Unknown100( - .param .u64 Unknown100_param_0, - .param .u64 Unknown100_param_1, - .param .u64 Unknown100_param_2, - .param .u64 Unknown100_param_3, - .param .u64 Unknown100_param_4, - .param .u64 Unknown100_param_5, - .param .u64 Unknown100_param_6, - .param .u64 Unknown100_param_7, - .param .u64 Unknown100_param_8, - .param .u64 Unknown100_param_9, - .param .u64 Unknown100_param_10, - .param .u64 Unknown100_param_11, - .param .u64 Unknown100_param_12, - .param .u64 Unknown100_param_13, - .param .u64 Unknown100_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB0_2; - ld.param.u64 %rd5, [Unknown100_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown100_param_1]; - ld.param.u64 %rd7, [Unknown100_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB0_2: - ret; - -} - // .globl Unknown99 -.visible .entry Unknown99( - .param .u64 Unknown99_param_0, - .param .u64 Unknown99_param_1, - .param .u64 Unknown99_param_2, - .param .u64 Unknown99_param_3, - .param .u64 Unknown99_param_4, - .param .u64 Unknown99_param_5, - .param .u64 Unknown99_param_6, - .param .u64 Unknown99_param_7, - .param .u64 Unknown99_param_8, - .param .u64 Unknown99_param_9, - .param .u64 Unknown99_param_10, - .param .u64 Unknown99_param_11, - .param .u64 Unknown99_param_12, - .param .u64 Unknown99_param_13, - .param .u64 Unknown99_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB1_2; - ld.param.u64 %rd5, [Unknown99_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown99_param_1]; - ld.param.u64 %rd7, [Unknown99_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB1_2: - ret; - -} - // .globl Unknown98 -.visible .entry Unknown98( - .param .u64 Unknown98_param_0, - .param .u64 Unknown98_param_1, - .param .u64 Unknown98_param_2, - .param .u64 Unknown98_param_3, - .param .u64 Unknown98_param_4, 
- .param .u64 Unknown98_param_5, - .param .u64 Unknown98_param_6, - .param .u64 Unknown98_param_7, - .param .u64 Unknown98_param_8, - .param .u64 Unknown98_param_9, - .param .u64 Unknown98_param_10, - .param .u64 Unknown98_param_11, - .param .u64 Unknown98_param_12, - .param .u64 Unknown98_param_13, - .param .u64 Unknown98_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB2_2; - ld.param.u64 %rd5, [Unknown98_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown98_param_1]; - ld.param.u64 %rd7, [Unknown98_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB2_2: - ret; - -} - // .globl Unknown97 -.visible .entry Unknown97( - .param .u64 Unknown97_param_0, - .param .u64 Unknown97_param_1, - .param .u64 Unknown97_param_2, - .param .u64 Unknown97_param_3, - .param .u64 Unknown97_param_4, - .param .u64 Unknown97_param_5, - .param .u64 Unknown97_param_6, - .param .u64 Unknown97_param_7, - .param .u64 Unknown97_param_8, - .param .u64 Unknown97_param_9, - .param .u64 Unknown97_param_10, - .param .u64 Unknown97_param_11, - .param .u64 Unknown97_param_12, - .param .u64 Unknown97_param_13, - .param .u64 Unknown97_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB3_2; - ld.param.u64 %rd5, [Unknown97_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown97_param_1]; - ld.param.u64 %rd7, [Unknown97_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB3_2: - ret; - -} - // .globl Unknown96 -.visible .entry Unknown96( - .param .u64 Unknown96_param_0, - .param .u64 Unknown96_param_1, - .param .u64 Unknown96_param_2, - .param .u64 Unknown96_param_3, - .param .u64 Unknown96_param_4, - .param .u64 Unknown96_param_5, - .param .u64 Unknown96_param_6, - .param .u64 Unknown96_param_7, - .param .u64 Unknown96_param_8, - .param .u64 Unknown96_param_9, - .param .u64 Unknown96_param_10, - .param .u64 Unknown96_param_11, - .param .u64 Unknown96_param_12, - .param .u64 Unknown96_param_13, - .param .u64 Unknown96_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB4_2; - ld.param.u64 %rd5, [Unknown96_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown96_param_1]; - 
ld.param.u64 %rd7, [Unknown96_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB4_2: - ret; - -} - // .globl Unknown95 -.visible .entry Unknown95( - .param .u64 Unknown95_param_0, - .param .u64 Unknown95_param_1, - .param .u64 Unknown95_param_2, - .param .u64 Unknown95_param_3, - .param .u64 Unknown95_param_4, - .param .u64 Unknown95_param_5, - .param .u64 Unknown95_param_6, - .param .u64 Unknown95_param_7, - .param .u64 Unknown95_param_8, - .param .u64 Unknown95_param_9, - .param .u64 Unknown95_param_10, - .param .u64 Unknown95_param_11, - .param .u64 Unknown95_param_12, - .param .u64 Unknown95_param_13, - .param .u64 Unknown95_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB5_2; - ld.param.u64 %rd5, [Unknown95_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown95_param_1]; - ld.param.u64 %rd7, [Unknown95_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB5_2: - ret; - -} - // .globl Unknown94 -.visible .entry Unknown94( - .param .u64 Unknown94_param_0, - .param .u64 Unknown94_param_1, - .param .u64 Unknown94_param_2, - .param .u64 Unknown94_param_3, - .param .u64 Unknown94_param_4, - .param .u64 Unknown94_param_5, - .param .u64 Unknown94_param_6, - .param .u64 Unknown94_param_7, - .param .u64 Unknown94_param_8, - .param .u64 Unknown94_param_9, - .param .u64 Unknown94_param_10, - .param .u64 Unknown94_param_11, - .param .u64 Unknown94_param_12, - .param .u64 Unknown94_param_13, - .param .u64 Unknown94_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB6_2; - ld.param.u64 %rd5, [Unknown94_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown94_param_1]; - ld.param.u64 %rd7, [Unknown94_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB6_2: - ret; - -} - // .globl Unknown93 -.visible .entry Unknown93( - .param .u64 Unknown93_param_0, - .param .u64 Unknown93_param_1, - .param .u64 Unknown93_param_2, - .param .u64 Unknown93_param_3, - .param .u64 Unknown93_param_4, - .param .u64 Unknown93_param_5, - .param .u64 Unknown93_param_6, - .param .u64 Unknown93_param_7, - .param 
.u64 Unknown93_param_8, - .param .u64 Unknown93_param_9, - .param .u64 Unknown93_param_10, - .param .u64 Unknown93_param_11, - .param .u64 Unknown93_param_12, - .param .u64 Unknown93_param_13, - .param .u64 Unknown93_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB7_2; - ld.param.u64 %rd5, [Unknown93_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown93_param_1]; - ld.param.u64 %rd7, [Unknown93_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB7_2: - ret; - -} // .globl Unknown92 +// __wg_Unknown58_kernel_0 has been demoted +// __wg_Unknown58_kernel_1 has been demoted +// __wg_Unknown58_kernel_2 has been demoted +// __wg_Unknown58_kernel_3 has been demoted +// __wg_Unknown58_kernel_4 has been demoted +// __wg_Unknown58_kernel_5 has been demoted + .visible .entry Unknown92( .param .u64 Unknown92_param_0, .param .u64 Unknown92_param_1, @@ -442,504 +32,44 @@ $L__BB7_2: .param .u64 Unknown92_param_14 ) { - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB8_2; - ld.param.u64 %rd5, [Unknown92_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown92_param_1]; - ld.param.u64 %rd7, [Unknown92_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB8_2: - ret; - -} - // .globl Unknown91 -.visible .entry Unknown91( - .param .u64 Unknown91_param_0, - .param .u64 Unknown91_param_1, - .param .u64 Unknown91_param_2, - .param .u64 Unknown91_param_3, - .param .u64 Unknown91_param_4, - .param .u64 Unknown91_param_5, - .param .u64 Unknown91_param_6, - .param .u64 Unknown91_param_7, - .param .u64 Unknown91_param_8, - .param .u64 Unknown91_param_9, - .param .u64 Unknown91_param_10, - .param .u64 Unknown91_param_11, - .param .u64 Unknown91_param_12, - .param .u64 Unknown91_param_13, - .param .u64 Unknown91_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB9_2; - ld.param.u64 %rd5, [Unknown91_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown91_param_1]; - ld.param.u64 %rd7, [Unknown91_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - 
ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB9_2: - ret; - -} - // .globl Unknown90 -.visible .entry Unknown90( - .param .u64 Unknown90_param_0, - .param .u64 Unknown90_param_1, - .param .u64 Unknown90_param_2, - .param .u64 Unknown90_param_3, - .param .u64 Unknown90_param_4, - .param .u64 Unknown90_param_5, - .param .u64 Unknown90_param_6, - .param .u64 Unknown90_param_7, - .param .u64 Unknown90_param_8, - .param .u64 Unknown90_param_9, - .param .u64 Unknown90_param_10, - .param .u64 Unknown90_param_11, - .param .u64 Unknown90_param_12, - .param .u64 Unknown90_param_13, - .param .u64 Unknown90_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB10_2; - ld.param.u64 %rd5, [Unknown90_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown90_param_1]; - ld.param.u64 %rd7, [Unknown90_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB10_2: - ret; - -} - // .globl Unknown89 -.visible .entry Unknown89( - .param .u64 Unknown89_param_0, - .param .u64 Unknown89_param_1, - .param .u64 Unknown89_param_2, - .param .u64 Unknown89_param_3, - .param .u64 Unknown89_param_4, - .param .u64 Unknown89_param_5, - .param .u64 Unknown89_param_6, - .param .u64 Unknown89_param_7, - .param .u64 Unknown89_param_8, - .param .u64 Unknown89_param_9, - .param .u64 Unknown89_param_10, - .param .u64 Unknown89_param_11, - .param .u64 Unknown89_param_12, - .param .u64 Unknown89_param_13, - .param .u64 Unknown89_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB11_2; - ld.param.u64 %rd5, [Unknown89_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown89_param_1]; - ld.param.u64 %rd7, [Unknown89_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB11_2: - ret; - -} - // .globl Unknown88 -.visible .entry Unknown88( - .param .u64 Unknown88_param_0, - .param .u64 Unknown88_param_1, - .param .u64 Unknown88_param_2, - .param .u64 Unknown88_param_3, - .param .u64 Unknown88_param_4, - .param .u64 Unknown88_param_5, - .param .u64 Unknown88_param_6, - .param .u64 Unknown88_param_7, - .param .u64 Unknown88_param_8, - .param .u64 Unknown88_param_9, - .param .u64 Unknown88_param_10, - .param .u64 Unknown88_param_11, - .param .u64 Unknown88_param_12, - .param .u64 Unknown88_param_13, - .param .u64 
Unknown88_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB12_2; - ld.param.u64 %rd5, [Unknown88_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown88_param_1]; - ld.param.u64 %rd7, [Unknown88_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB12_2: - ret; - -} - // .globl Unknown87 -.visible .entry Unknown87( - .param .u64 Unknown87_param_0, - .param .u64 Unknown87_param_1, - .param .u64 Unknown87_param_2, - .param .u64 Unknown87_param_3, - .param .u64 Unknown87_param_4, - .param .u64 Unknown87_param_5, - .param .u64 Unknown87_param_6, - .param .u64 Unknown87_param_7, - .param .u64 Unknown87_param_8, - .param .u64 Unknown87_param_9, - .param .u64 Unknown87_param_10, - .param .u64 Unknown87_param_11, - .param .u64 Unknown87_param_12, - .param .u64 Unknown87_param_13, - .param .u64 Unknown87_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB13_2; - ld.param.u64 %rd5, [Unknown87_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown87_param_1]; - ld.param.u64 %rd7, [Unknown87_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB13_2: - ret; - -} - // .globl Unknown86 -.visible .entry Unknown86( - .param .u64 Unknown86_param_0, - .param .u64 Unknown86_param_1, - .param .u64 Unknown86_param_2, - .param .u64 Unknown86_param_3, - .param .u64 Unknown86_param_4, - .param .u64 Unknown86_param_5, - .param .u64 Unknown86_param_6, - .param .u64 Unknown86_param_7, - .param .u64 Unknown86_param_8, - .param .u64 Unknown86_param_9, - .param .u64 Unknown86_param_10, - .param .u64 Unknown86_param_11, - .param .u64 Unknown86_param_12, - .param .u64 Unknown86_param_13, - .param .u64 Unknown86_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB14_2; - ld.param.u64 %rd5, [Unknown86_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown86_param_1]; - ld.param.u64 %rd7, [Unknown86_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 
0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB14_2: - ret; - -} - // .globl Unknown85 -.visible .entry Unknown85( - .param .u64 Unknown85_param_0, - .param .u64 Unknown85_param_1, - .param .u64 Unknown85_param_2, - .param .u64 Unknown85_param_3, - .param .u64 Unknown85_param_4, - .param .u64 Unknown85_param_5, - .param .u64 Unknown85_param_6, - .param .u64 Unknown85_param_7, - .param .u64 Unknown85_param_8, - .param .u64 Unknown85_param_9, - .param .u64 Unknown85_param_10, - .param .u64 Unknown85_param_11, - .param .u64 Unknown85_param_12, - .param .u64 Unknown85_param_13, - .param .u64 Unknown85_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB15_2; - ld.param.u64 %rd5, [Unknown85_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown85_param_1]; - ld.param.u64 %rd7, [Unknown85_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB15_2: - ret; - -} - // .globl Unknown84 -.visible .entry Unknown84( - .param .u64 Unknown84_param_0, - .param .u64 Unknown84_param_1, - .param .u64 Unknown84_param_2, - .param .u64 Unknown84_param_3, - .param .u64 Unknown84_param_4, - .param .u64 Unknown84_param_5, - .param .u64 Unknown84_param_6, - .param .u64 Unknown84_param_7, - .param .u64 Unknown84_param_8, - .param .u64 Unknown84_param_9, - .param .u64 Unknown84_param_10, - .param .u64 Unknown84_param_11, - .param .u64 Unknown84_param_12, - .param .u64 Unknown84_param_13, - .param .u64 Unknown84_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB16_2; - ld.param.u64 %rd5, [Unknown84_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown84_param_1]; - ld.param.u64 %rd7, [Unknown84_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB16_2: - ret; - -} - // .globl Unknown83 -.visible .entry Unknown83( - .param .u64 Unknown83_param_0, - .param .u64 Unknown83_param_1, - .param .u64 Unknown83_param_2, - .param .u64 Unknown83_param_3, - .param .u64 Unknown83_param_4, - .param .u64 Unknown83_param_5, - .param .u64 Unknown83_param_6, - .param .u64 Unknown83_param_7, - .param .u64 Unknown83_param_8, - .param .u64 Unknown83_param_9, - .param .u64 Unknown83_param_10, - .param .u64 Unknown83_param_11, - .param .u64 Unknown83_param_12, - .param .u64 Unknown83_param_13, - .param .u64 Unknown83_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b32 
%r<5>; .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB17_2; - ld.param.u64 %rd5, [Unknown83_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown83_param_1]; - ld.param.u64 %rd7, [Unknown83_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB17_2: + .reg .b64 %rd<22>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 511; + @%p1 bra $L__BB0_3; + ld.param.u64 %rd12, [Unknown92_param_11]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown92_param_1]; + ld.param.u64 %rd14, [Unknown92_param_6]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 2; + shl.b64 %rd7, %rd5, 2; +$L__BB0_2: + add.s64 %rd17, %rd2, %rd20; + ld.global.nc.f32 %f1, [%rd17]; + add.s64 %rd18, %rd3, %rd20; + ld.global.nc.f32 %f2, [%rd18]; + mul.rn.f32 %f3, %f1, 0f3F666666; + mul.rn.f32 %f4, %f2, 0f3DCCCCCD; + add.rn.f32 %f5, %f3, %f4; + add.s64 %rd19, %rd1, %rd20; + st.global.f32 [%rd19], %f5; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 512; + @%p2 bra $L__BB0_2; +$L__BB0_3: ret; } @@ -962,504 +92,44 @@ $L__BB17_2: .param .u64 Unknown82_param_14 ) { - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB18_2; - ld.param.u64 %rd5, [Unknown82_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown82_param_1]; - ld.param.u64 %rd7, [Unknown82_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB18_2: - ret; - -} - // .globl Unknown81 -.visible .entry Unknown81( - .param .u64 Unknown81_param_0, - .param .u64 Unknown81_param_1, - .param .u64 Unknown81_param_2, - .param .u64 Unknown81_param_3, - .param .u64 Unknown81_param_4, - .param .u64 Unknown81_param_5, - .param .u64 Unknown81_param_6, - .param .u64 Unknown81_param_7, - .param .u64 Unknown81_param_8, - .param .u64 Unknown81_param_9, - .param .u64 Unknown81_param_10, - .param .u64 Unknown81_param_11, - .param .u64 Unknown81_param_12, - .param .u64 Unknown81_param_13, - .param .u64 Unknown81_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra 
$L__BB19_2; - ld.param.u64 %rd5, [Unknown81_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown81_param_1]; - ld.param.u64 %rd7, [Unknown81_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB19_2: - ret; - -} - // .globl Unknown80 -.visible .entry Unknown80( - .param .u64 Unknown80_param_0, - .param .u64 Unknown80_param_1, - .param .u64 Unknown80_param_2, - .param .u64 Unknown80_param_3, - .param .u64 Unknown80_param_4, - .param .u64 Unknown80_param_5, - .param .u64 Unknown80_param_6, - .param .u64 Unknown80_param_7, - .param .u64 Unknown80_param_8, - .param .u64 Unknown80_param_9, - .param .u64 Unknown80_param_10, - .param .u64 Unknown80_param_11, - .param .u64 Unknown80_param_12, - .param .u64 Unknown80_param_13, - .param .u64 Unknown80_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB20_2; - ld.param.u64 %rd5, [Unknown80_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown80_param_1]; - ld.param.u64 %rd7, [Unknown80_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB20_2: - ret; - -} - // .globl Unknown79 -.visible .entry Unknown79( - .param .u64 Unknown79_param_0, - .param .u64 Unknown79_param_1, - .param .u64 Unknown79_param_2, - .param .u64 Unknown79_param_3, - .param .u64 Unknown79_param_4, - .param .u64 Unknown79_param_5, - .param .u64 Unknown79_param_6, - .param .u64 Unknown79_param_7, - .param .u64 Unknown79_param_8, - .param .u64 Unknown79_param_9, - .param .u64 Unknown79_param_10, - .param .u64 Unknown79_param_11, - .param .u64 Unknown79_param_12, - .param .u64 Unknown79_param_13, - .param .u64 Unknown79_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB21_2; - ld.param.u64 %rd5, [Unknown79_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown79_param_1]; - ld.param.u64 %rd7, [Unknown79_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB21_2: - ret; - -} - // .globl Unknown78 -.visible .entry Unknown78( - .param .u64 Unknown78_param_0, - .param .u64 Unknown78_param_1, - .param .u64 Unknown78_param_2, - .param .u64 Unknown78_param_3, - 
.param .u64 Unknown78_param_4, - .param .u64 Unknown78_param_5, - .param .u64 Unknown78_param_6, - .param .u64 Unknown78_param_7, - .param .u64 Unknown78_param_8, - .param .u64 Unknown78_param_9, - .param .u64 Unknown78_param_10, - .param .u64 Unknown78_param_11, - .param .u64 Unknown78_param_12, - .param .u64 Unknown78_param_13, - .param .u64 Unknown78_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB22_2; - ld.param.u64 %rd5, [Unknown78_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown78_param_1]; - ld.param.u64 %rd7, [Unknown78_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB22_2: - ret; - -} - // .globl Unknown77 -.visible .entry Unknown77( - .param .u64 Unknown77_param_0, - .param .u64 Unknown77_param_1, - .param .u64 Unknown77_param_2, - .param .u64 Unknown77_param_3, - .param .u64 Unknown77_param_4, - .param .u64 Unknown77_param_5, - .param .u64 Unknown77_param_6, - .param .u64 Unknown77_param_7, - .param .u64 Unknown77_param_8, - .param .u64 Unknown77_param_9, - .param .u64 Unknown77_param_10, - .param .u64 Unknown77_param_11, - .param .u64 Unknown77_param_12, - .param .u64 Unknown77_param_13, - .param .u64 Unknown77_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB23_2; - ld.param.u64 %rd5, [Unknown77_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown77_param_1]; - ld.param.u64 %rd7, [Unknown77_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB23_2: - ret; - -} - // .globl Unknown76 -.visible .entry Unknown76( - .param .u64 Unknown76_param_0, - .param .u64 Unknown76_param_1, - .param .u64 Unknown76_param_2, - .param .u64 Unknown76_param_3, - .param .u64 Unknown76_param_4, - .param .u64 Unknown76_param_5, - .param .u64 Unknown76_param_6, - .param .u64 Unknown76_param_7, - .param .u64 Unknown76_param_8, - .param .u64 Unknown76_param_9, - .param .u64 Unknown76_param_10, - .param .u64 Unknown76_param_11, - .param .u64 Unknown76_param_12, - .param .u64 Unknown76_param_13, - .param .u64 Unknown76_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB24_2; - ld.param.u64 %rd5, [Unknown76_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 
%rd6, [Unknown76_param_1]; - ld.param.u64 %rd7, [Unknown76_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB24_2: - ret; - -} - // .globl Unknown75 -.visible .entry Unknown75( - .param .u64 Unknown75_param_0, - .param .u64 Unknown75_param_1, - .param .u64 Unknown75_param_2, - .param .u64 Unknown75_param_3, - .param .u64 Unknown75_param_4, - .param .u64 Unknown75_param_5, - .param .u64 Unknown75_param_6, - .param .u64 Unknown75_param_7, - .param .u64 Unknown75_param_8, - .param .u64 Unknown75_param_9, - .param .u64 Unknown75_param_10, - .param .u64 Unknown75_param_11, - .param .u64 Unknown75_param_12, - .param .u64 Unknown75_param_13, - .param .u64 Unknown75_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB25_2; - ld.param.u64 %rd5, [Unknown75_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown75_param_1]; - ld.param.u64 %rd7, [Unknown75_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB25_2: - ret; - -} - // .globl Unknown74 -.visible .entry Unknown74( - .param .u64 Unknown74_param_0, - .param .u64 Unknown74_param_1, - .param .u64 Unknown74_param_2, - .param .u64 Unknown74_param_3, - .param .u64 Unknown74_param_4, - .param .u64 Unknown74_param_5, - .param .u64 Unknown74_param_6, - .param .u64 Unknown74_param_7, - .param .u64 Unknown74_param_8, - .param .u64 Unknown74_param_9, - .param .u64 Unknown74_param_10, - .param .u64 Unknown74_param_11, - .param .u64 Unknown74_param_12, - .param .u64 Unknown74_param_13, - .param .u64 Unknown74_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB26_2; - ld.param.u64 %rd5, [Unknown74_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown74_param_1]; - ld.param.u64 %rd7, [Unknown74_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB26_2: - ret; - -} - // .globl Unknown73 -.visible .entry Unknown73( - .param .u64 Unknown73_param_0, - .param .u64 Unknown73_param_1, - .param .u64 Unknown73_param_2, - .param .u64 Unknown73_param_3, - .param .u64 Unknown73_param_4, - .param .u64 Unknown73_param_5, - .param .u64 Unknown73_param_6, - .param 
.u64 Unknown73_param_7, - .param .u64 Unknown73_param_8, - .param .u64 Unknown73_param_9, - .param .u64 Unknown73_param_10, - .param .u64 Unknown73_param_11, - .param .u64 Unknown73_param_12, - .param .u64 Unknown73_param_13, - .param .u64 Unknown73_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b32 %r<5>; .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB27_2; - ld.param.u64 %rd5, [Unknown73_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown73_param_1]; - ld.param.u64 %rd7, [Unknown73_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB27_2: + .reg .b64 %rd<22>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 255; + @%p1 bra $L__BB1_3; + ld.param.u64 %rd12, [Unknown82_param_11]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown82_param_1]; + ld.param.u64 %rd14, [Unknown82_param_6]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 2; + shl.b64 %rd7, %rd5, 2; +$L__BB1_2: + add.s64 %rd17, %rd2, %rd20; + ld.global.nc.f32 %f1, [%rd17]; + add.s64 %rd18, %rd3, %rd20; + ld.global.nc.f32 %f2, [%rd18]; + mul.rn.f32 %f3, %f1, 0f3F666666; + mul.rn.f32 %f4, %f2, 0f3DCCCCCD; + add.rn.f32 %f5, %f3, %f4; + add.s64 %rd19, %rd1, %rd20; + st.global.f32 [%rd19], %f5; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 256; + @%p2 bra $L__BB1_2; +$L__BB1_3: ret; } @@ -1482,504 +152,44 @@ $L__BB27_2: .param .u64 Unknown72_param_14 ) { - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB28_2; - ld.param.u64 %rd5, [Unknown72_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown72_param_1]; - ld.param.u64 %rd7, [Unknown72_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB28_2: - ret; - -} - // .globl Unknown71 -.visible .entry Unknown71( - .param .u64 Unknown71_param_0, - .param .u64 Unknown71_param_1, - .param .u64 Unknown71_param_2, - .param .u64 Unknown71_param_3, - .param .u64 Unknown71_param_4, - .param .u64 Unknown71_param_5, - .param .u64 Unknown71_param_6, - .param .u64 Unknown71_param_7, - .param .u64 Unknown71_param_8, - .param .u64 Unknown71_param_9, - .param .u64 Unknown71_param_10, - .param .u64 Unknown71_param_11, - .param .u64 Unknown71_param_12, - .param .u64 
Unknown71_param_13, - .param .u64 Unknown71_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB29_2; - ld.param.u64 %rd5, [Unknown71_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown71_param_1]; - ld.param.u64 %rd7, [Unknown71_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB29_2: - ret; - -} - // .globl Unknown70 -.visible .entry Unknown70( - .param .u64 Unknown70_param_0, - .param .u64 Unknown70_param_1, - .param .u64 Unknown70_param_2, - .param .u64 Unknown70_param_3, - .param .u64 Unknown70_param_4, - .param .u64 Unknown70_param_5, - .param .u64 Unknown70_param_6, - .param .u64 Unknown70_param_7, - .param .u64 Unknown70_param_8, - .param .u64 Unknown70_param_9, - .param .u64 Unknown70_param_10, - .param .u64 Unknown70_param_11, - .param .u64 Unknown70_param_12, - .param .u64 Unknown70_param_13, - .param .u64 Unknown70_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB30_2; - ld.param.u64 %rd5, [Unknown70_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown70_param_1]; - ld.param.u64 %rd7, [Unknown70_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB30_2: - ret; - -} - // .globl Unknown69 -.visible .entry Unknown69( - .param .u64 Unknown69_param_0, - .param .u64 Unknown69_param_1, - .param .u64 Unknown69_param_2, - .param .u64 Unknown69_param_3, - .param .u64 Unknown69_param_4, - .param .u64 Unknown69_param_5, - .param .u64 Unknown69_param_6, - .param .u64 Unknown69_param_7, - .param .u64 Unknown69_param_8, - .param .u64 Unknown69_param_9, - .param .u64 Unknown69_param_10, - .param .u64 Unknown69_param_11, - .param .u64 Unknown69_param_12, - .param .u64 Unknown69_param_13, - .param .u64 Unknown69_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB31_2; - ld.param.u64 %rd5, [Unknown69_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown69_param_1]; - ld.param.u64 %rd7, [Unknown69_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; 
- mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB31_2: - ret; - -} - // .globl Unknown68 -.visible .entry Unknown68( - .param .u64 Unknown68_param_0, - .param .u64 Unknown68_param_1, - .param .u64 Unknown68_param_2, - .param .u64 Unknown68_param_3, - .param .u64 Unknown68_param_4, - .param .u64 Unknown68_param_5, - .param .u64 Unknown68_param_6, - .param .u64 Unknown68_param_7, - .param .u64 Unknown68_param_8, - .param .u64 Unknown68_param_9, - .param .u64 Unknown68_param_10, - .param .u64 Unknown68_param_11, - .param .u64 Unknown68_param_12, - .param .u64 Unknown68_param_13, - .param .u64 Unknown68_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB32_2; - ld.param.u64 %rd5, [Unknown68_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown68_param_1]; - ld.param.u64 %rd7, [Unknown68_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB32_2: - ret; - -} - // .globl Unknown67 -.visible .entry Unknown67( - .param .u64 Unknown67_param_0, - .param .u64 Unknown67_param_1, - .param .u64 Unknown67_param_2, - .param .u64 Unknown67_param_3, - .param .u64 Unknown67_param_4, - .param .u64 Unknown67_param_5, - .param .u64 Unknown67_param_6, - .param .u64 Unknown67_param_7, - .param .u64 Unknown67_param_8, - .param .u64 Unknown67_param_9, - .param .u64 Unknown67_param_10, - .param .u64 Unknown67_param_11, - .param .u64 Unknown67_param_12, - .param .u64 Unknown67_param_13, - .param .u64 Unknown67_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB33_2; - ld.param.u64 %rd5, [Unknown67_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown67_param_1]; - ld.param.u64 %rd7, [Unknown67_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB33_2: - ret; - -} - // .globl Unknown66 -.visible .entry Unknown66( - .param .u64 Unknown66_param_0, - .param .u64 Unknown66_param_1, - .param .u64 Unknown66_param_2, - .param .u64 Unknown66_param_3, - .param .u64 Unknown66_param_4, - .param .u64 Unknown66_param_5, - .param .u64 Unknown66_param_6, - .param .u64 Unknown66_param_7, - .param .u64 Unknown66_param_8, - .param .u64 Unknown66_param_9, - .param .u64 Unknown66_param_10, - .param .u64 Unknown66_param_11, - .param .u64 Unknown66_param_12, - .param .u64 Unknown66_param_13, - .param .u64 Unknown66_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 
%f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB34_2; - ld.param.u64 %rd5, [Unknown66_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown66_param_1]; - ld.param.u64 %rd7, [Unknown66_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB34_2: - ret; - -} - // .globl Unknown65 -.visible .entry Unknown65( - .param .u64 Unknown65_param_0, - .param .u64 Unknown65_param_1, - .param .u64 Unknown65_param_2, - .param .u64 Unknown65_param_3, - .param .u64 Unknown65_param_4, - .param .u64 Unknown65_param_5, - .param .u64 Unknown65_param_6, - .param .u64 Unknown65_param_7, - .param .u64 Unknown65_param_8, - .param .u64 Unknown65_param_9, - .param .u64 Unknown65_param_10, - .param .u64 Unknown65_param_11, - .param .u64 Unknown65_param_12, - .param .u64 Unknown65_param_13, - .param .u64 Unknown65_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB35_2; - ld.param.u64 %rd5, [Unknown65_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown65_param_1]; - ld.param.u64 %rd7, [Unknown65_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB35_2: - ret; - -} - // .globl Unknown64 -.visible .entry Unknown64( - .param .u64 Unknown64_param_0, - .param .u64 Unknown64_param_1, - .param .u64 Unknown64_param_2, - .param .u64 Unknown64_param_3, - .param .u64 Unknown64_param_4, - .param .u64 Unknown64_param_5, - .param .u64 Unknown64_param_6, - .param .u64 Unknown64_param_7, - .param .u64 Unknown64_param_8, - .param .u64 Unknown64_param_9, - .param .u64 Unknown64_param_10, - .param .u64 Unknown64_param_11, - .param .u64 Unknown64_param_12, - .param .u64 Unknown64_param_13, - .param .u64 Unknown64_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB36_2; - ld.param.u64 %rd5, [Unknown64_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown64_param_1]; - ld.param.u64 %rd7, [Unknown64_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 
[%rd13], %f5; -$L__BB36_2: - ret; - -} - // .globl Unknown63 -.visible .entry Unknown63( - .param .u64 Unknown63_param_0, - .param .u64 Unknown63_param_1, - .param .u64 Unknown63_param_2, - .param .u64 Unknown63_param_3, - .param .u64 Unknown63_param_4, - .param .u64 Unknown63_param_5, - .param .u64 Unknown63_param_6, - .param .u64 Unknown63_param_7, - .param .u64 Unknown63_param_8, - .param .u64 Unknown63_param_9, - .param .u64 Unknown63_param_10, - .param .u64 Unknown63_param_11, - .param .u64 Unknown63_param_12, - .param .u64 Unknown63_param_13, - .param .u64 Unknown63_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b32 %r<5>; .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB37_2; - ld.param.u64 %rd5, [Unknown63_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown63_param_1]; - ld.param.u64 %rd7, [Unknown63_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB37_2: + .reg .b64 %rd<22>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 127; + @%p1 bra $L__BB2_3; + ld.param.u64 %rd12, [Unknown72_param_11]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown72_param_1]; + ld.param.u64 %rd14, [Unknown72_param_6]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 2; + shl.b64 %rd7, %rd5, 2; +$L__BB2_2: + add.s64 %rd17, %rd2, %rd20; + ld.global.nc.f32 %f1, [%rd17]; + add.s64 %rd18, %rd3, %rd20; + ld.global.nc.f32 %f2, [%rd18]; + mul.rn.f32 %f3, %f1, 0f3F666666; + mul.rn.f32 %f4, %f2, 0f3DCCCCCD; + add.rn.f32 %f5, %f3, %f4; + add.s64 %rd19, %rd1, %rd20; + st.global.f32 [%rd19], %f5; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 128; + @%p2 bra $L__BB2_2; +$L__BB2_3: ret; } @@ -2002,36 +212,44 @@ $L__BB37_2: .param .u64 Unknown62_param_14 ) { - .reg .pred %p<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b32 %r<5>; .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB38_2; - ld.param.u64 %rd5, [Unknown62_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown62_param_1]; - ld.param.u64 %rd7, [Unknown62_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB38_2: + .reg .b64 %rd<22>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, 
%r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 63; + @%p1 bra $L__BB3_3; + ld.param.u64 %rd12, [Unknown62_param_11]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown62_param_1]; + ld.param.u64 %rd14, [Unknown62_param_6]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 2; + shl.b64 %rd7, %rd5, 2; +$L__BB3_2: + add.s64 %rd17, %rd2, %rd20; + ld.global.nc.f32 %f1, [%rd17]; + add.s64 %rd18, %rd3, %rd20; + ld.global.nc.f32 %f2, [%rd18]; + mul.rn.f32 %f3, %f1, 0f3F666666; + mul.rn.f32 %f4, %f2, 0f3DCCCCCD; + add.rn.f32 %f5, %f3, %f4; + add.s64 %rd19, %rd1, %rd20; + st.global.f32 [%rd19], %f5; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 64; + @%p2 bra $L__BB3_2; +$L__BB3_3: ret; } @@ -2051,39 +269,54 @@ $L__BB38_2: .param .u64 Unknown61_param_11, .param .u64 Unknown61_param_12, .param .u64 Unknown61_param_13, - .param .u64 Unknown61_param_14 + .param .u64 Unknown61_param_14, + .param .u64 Unknown61_param_15, + .param .u64 Unknown61_param_16, + .param .u64 Unknown61_param_17, + .param .u64 Unknown61_param_18 ) { - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; + .reg .pred %p<3>; + .reg .b16 %rs<4>; + .reg .b32 %r<5>; + .reg .f32 %f<2>; + .reg .b64 %rd<27>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB39_2; - ld.param.u64 %rd5, [Unknown61_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown61_param_1]; - ld.param.u64 %rd7, [Unknown61_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB39_2: + cvt.s64.s32 %rd19, %r3; + mul.wide.s32 %rd20, %r2, %r1; + add.s64 %rd26, %rd20, %rd19; + setp.gt.s64 %p1, %rd26, 999; + @%p1 bra $L__BB4_3; + ld.param.u64 %rd16, [Unknown61_param_13]; + cvta.to.global.u64 %rd1, %rd16; + ld.param.u64 %rd17, [Unknown61_param_1]; + ld.param.u64 %rd18, [Unknown61_param_6]; + cvta.to.global.u64 %rd2, %rd18; + cvta.to.global.u64 %rd3, %rd17; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd21, %rd26, 2; + add.s64 %rd25, %rd3, %rd21; + shl.b64 %rd7, %rd5, 2; + shl.b64 %rd24, %rd26, 1; + shl.b64 %rd9, %rd5, 1; +$L__BB4_2: + ld.global.nc.f32 %f1, [%rd25]; + add.s64 %rd22, %rd2, %rd24; + ld.global.nc.u16 %rs1, [%rd22]; + cvt.rn.f16.f32 %rs2, %f1; + add.rn.f16 %rs3, %rs1, %rs2; + add.s64 %rd23, %rd1, %rd24; + st.global.b16 [%rd23], %rs3; + add.s64 %rd26, %rd26, %rd5; + add.s64 %rd25, %rd25, %rd7; + add.s64 %rd24, %rd24, %rd9; + setp.lt.s64 %p2, %rd26, 1000; + @%p2 bra $L__BB4_2; +$L__BB4_3: ret; } @@ -2102,45 +335,45 @@ $L__BB39_2: .param .u64 Unknown60_param_10, .param .u64 Unknown60_param_11, .param .u64 Unknown60_param_12, - .param .u64 Unknown60_param_13, - .param .u64 Unknown60_param_14, - .param .u64 Unknown60_param_15, - .param .u64 Unknown60_param_16, - .param .u64 Unknown60_param_17, - .param .u64 Unknown60_param_18 + .param .u64 Unknown60_param_13 ) { - .reg .pred %p<2>; - .reg .b16 %h<4>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg 
.b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<15>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 999; - @%p1 bra $L__BB40_2; - ld.param.u64 %rd5, [Unknown60_param_13]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown60_param_1]; - ld.param.u64 %rd7, [Unknown60_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 1; - add.s64 %rd11, %rd2, %rd10; - ld.global.b16 %h1, [%rd11]; - shl.b64 %rd12, %rd4, 2; - add.s64 %rd13, %rd3, %rd12; - ld.global.f32 %f1, [%rd13]; - cvt.rn.f16.f32 %h2, %f1; - add.rn.f16 %h3, %h1, %h2; - add.s64 %rd14, %rd1, %rd10; - st.global.b16 [%rd14], %h3; -$L__BB40_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 511999; + @%p1 bra $L__BB5_3; + ld.param.u64 %rd15, [Unknown60_param_8]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown60_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB5_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 512000; + @%p2 bra $L__BB5_2; +$L__BB5_3: ret; } @@ -2160,502 +393,41 @@ $L__BB40_2: .param .u64 Unknown59_param_11, .param .u64 Unknown59_param_12, .param .u64 Unknown59_param_13 -) -{ - .reg .pred %p<3>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<27>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 511999; - @%p1 bra $L__BB41_2; - ld.param.u64 %rd4, [Unknown59_param_8]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown59_param_1]; - cvta.to.global.u64 %rd2, %rd5; - shr.s64 %rd8, %rd3, 63; - shr.u64 %rd9, %rd8, 55; - add.s64 %rd10, %rd3, %rd9; - and.b64 %rd11, %rd10, -512; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 512; - selp.b64 %rd14, %rd13, %rd12, %p2; - xor.b64 %rd15, %rd8, %rd3; - shr.s64 %rd16, %rd15, 63; - shr.u64 %rd17, %rd16, 55; - add.s64 %rd18, %rd15, %rd17; - shr.u64 %rd19, %rd18, 9; - xor.b64 %rd20, %rd19, %rd8; - shl.b64 %rd21, %rd20, 9; - add.s64 %rd22, %rd21, %rd14; - shl.b64 %rd23, %rd22, 2; - add.s64 %rd24, %rd2, %rd23; - ld.global.f32 %f1, [%rd24]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd25, %rd22, 1; - add.s64 %rd26, %rd1, %rd25; - st.global.b16 [%rd26], %h1; -$L__BB41_2: - ret; - -} - // .globl Unknown58 -.visible .entry Unknown58( - .param .u64 Unknown58_param_0, - .param .u64 Unknown58_param_1, - .param .u64 Unknown58_param_2, - .param .u64 Unknown58_param_3, - .param .u64 Unknown58_param_4, - .param .u64 Unknown58_param_5, - .param .u64 Unknown58_param_6, - .param .u64 Unknown58_param_7, - .param .u64 Unknown58_param_8, - .param .u64 Unknown58_param_9, - .param .u64 Unknown58_param_10, - .param .u64 Unknown58_param_11, - .param .u64 Unknown58_param_12, - .param .u64 Unknown58_param_13 -) -{ - .reg .pred %p<2>; - .reg .b16 %h<4>; - .reg .b32 %r<4>; - .reg .b64 
%rd<11>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd7, %r3; - mul.wide.s32 %rd8, %r2, %r1; - add.s64 %rd9, %rd8, %rd7; - setp.gt.s64 %p1, %rd9, 511; - @%p1 bra $L__BB42_2; - ld.param.u64 %rd3, [Unknown58_param_8]; - cvta.to.global.u64 %rd4, %rd3; - ld.param.u64 %rd5, [Unknown58_param_1]; - cvta.to.global.u64 %rd6, %rd5; - shl.b64 %rd10, %rd9, 1; - add.s64 %rd1, %rd6, %rd10; - add.s64 %rd2, %rd4, %rd10; - ld.global.b16 %h1, [%rd1]; - mov.b16 %h2, 0x2539; - mul.rn.f16 %h3, %h1, %h2; - st.global.b16 [%rd2], %h3; -$L__BB42_2: - ret; - -} - // .globl Unknown57 -.visible .entry Unknown57( - .param .u64 Unknown57_param_0, - .param .u64 Unknown57_param_1, - .param .u64 Unknown57_param_2, - .param .u64 Unknown57_param_3, - .param .u64 Unknown57_param_4, - .param .u64 Unknown57_param_5, - .param .u64 Unknown57_param_6, - .param .u64 Unknown57_param_7, - .param .u64 Unknown57_param_8, - .param .u64 Unknown57_param_9, - .param .u64 Unknown57_param_10, - .param .u64 Unknown57_param_11, - .param .u64 Unknown57_param_12, - .param .u64 Unknown57_param_13, - .param .u64 Unknown57_param_14, - .param .u64 Unknown57_param_15, - .param .u64 Unknown57_param_16, - .param .u64 Unknown57_param_17, - .param .u64 Unknown57_param_18, - .param .u64 Unknown57_param_19, - .param .u64 Unknown57_param_20, - .param .u64 Unknown57_param_21, - .param .u64 Unknown57_param_22, - .param .u64 Unknown57_param_23, - .param .u64 Unknown57_param_24, - .param .u64 Unknown57_param_25, - .param .u64 Unknown57_param_26, - .param .u64 Unknown57_param_27, - .param .u64 Unknown57_param_28, - .param .u64 Unknown57_param_29, - .param .u64 Unknown57_param_30, - .param .u64 Unknown57_param_31, - .param .u64 Unknown57_param_32 -) -{ - .reg .pred %p<4>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 25087; - @%p1 bra $L__BB43_2; - ld.param.u64 %rd5, [Unknown57_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown57_param_1]; - ld.param.u64 %rd7, [Unknown57_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 1; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 7; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 7; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 1; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 1; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 7; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 7; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.u64 %rd37, %rd35, 1; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 49; - mul.lo.s64 %rd41, %rd32, 7; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - 
ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - cvt.f32.f16 %f1, %h3; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h4, %f2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB43_2: - ret; - -} - // .globl Unknown55 -.visible .entry Unknown55( - .param .u64 Unknown55_param_0, - .param .u64 Unknown55_param_1, - .param .u64 Unknown55_param_2, - .param .u64 Unknown55_param_3, - .param .u64 Unknown55_param_4, - .param .u64 Unknown55_param_5, - .param .u64 Unknown55_param_6, - .param .u64 Unknown55_param_7, - .param .u64 Unknown55_param_8, - .param .u64 Unknown55_param_9, - .param .u64 Unknown55_param_10, - .param .u64 Unknown55_param_11, - .param .u64 Unknown55_param_12, - .param .u64 Unknown55_param_13, - .param .u64 Unknown55_param_14, - .param .u64 Unknown55_param_15, - .param .u64 Unknown55_param_16, - .param .u64 Unknown55_param_17, - .param .u64 Unknown55_param_18, - .param .u64 Unknown55_param_19, - .param .u64 Unknown55_param_20, - .param .u64 Unknown55_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 2359295; - @%p1 bra $L__BB44_2; - ld.param.u64 %rd4, [Unknown55_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown55_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 55; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -512; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 512; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 55; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 9; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 4608; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB44_2: - ret; - -} - // .globl Unknown54 -.visible .entry Unknown54( - .param .u64 Unknown54_param_0, - .param .u64 Unknown54_param_1, - .param .u64 Unknown54_param_2, - .param .u64 Unknown54_param_3, - .param .u64 Unknown54_param_4, - .param .u64 Unknown54_param_5, - .param .u64 Unknown54_param_6, - .param .u64 Unknown54_param_7, - .param .u64 
Unknown54_param_8, - .param .u64 Unknown54_param_9, - .param .u64 Unknown54_param_10, - .param .u64 Unknown54_param_11, - .param .u64 Unknown54_param_12, - .param .u64 Unknown54_param_13, - .param .u64 Unknown54_param_14, - .param .u64 Unknown54_param_15, - .param .u64 Unknown54_param_16, - .param .u64 Unknown54_param_17, - .param .u64 Unknown54_param_18, - .param .u64 Unknown54_param_19, - .param .u64 Unknown54_param_20, - .param .u64 Unknown54_param_21 -) -{ - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 25087; - @%p1 bra $L__BB45_2; - ld.param.u64 %rd4, [Unknown54_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown54_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 1; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 7; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 7; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 1; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 1; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 7; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 7; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd33, 1; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 49; - mul.lo.s64 %rd39, %rd30, 7; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 %h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB45_2: - ret; - -} - // .globl Unknown52 -.visible .entry Unknown52( - .param .u64 Unknown52_param_0, - .param .u64 Unknown52_param_1, - .param .u64 Unknown52_param_2, - .param .u64 Unknown52_param_3, - .param .u64 Unknown52_param_4, - .param .u64 Unknown52_param_5, - .param .u64 Unknown52_param_6, - .param .u64 Unknown52_param_7, - .param .u64 Unknown52_param_8, - .param .u64 Unknown52_param_9, - .param .u64 Unknown52_param_10, - .param .u64 Unknown52_param_11, - .param .u64 Unknown52_param_12, - .param .u64 Unknown52_param_13, - .param .u64 Unknown52_param_14, - .param .u64 Unknown52_param_15, - .param .u64 Unknown52_param_16, - .param .u64 Unknown52_param_17, - .param .u64 Unknown52_param_18, - .param .u64 Unknown52_param_19, - .param .u64 Unknown52_param_20, - .param .u64 Unknown52_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 2359295; - @%p1 bra $L__BB46_2; - ld.param.u64 %rd4, [Unknown52_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown52_param_1]; - 
cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 55; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -512; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 512; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 55; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 9; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 4608; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB46_2: +) +{ + .reg .pred %p<3>; + .reg .b16 %rs<4>; + .reg .b32 %r<5>; + .reg .b64 %rd<19>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd13, %r3; + mul.wide.s32 %rd14, %r2, %r1; + add.s64 %rd18, %rd14, %rd13; + setp.gt.s64 %p1, %rd18, 511; + @%p1 bra $L__BB6_3; + ld.param.u64 %rd11, [Unknown59_param_8]; + cvta.to.global.u64 %rd1, %rd11; + ld.param.u64 %rd12, [Unknown59_param_1]; + cvta.to.global.u64 %rd2, %rd12; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd17, %rd18, 1; + shl.b64 %rd6, %rd4, 1; +$L__BB6_2: + add.s64 %rd15, %rd2, %rd17; + ld.global.nc.u16 %rs1, [%rd15]; + mov.b16 %rs2, 0x2539; + mul.rn.f16 %rs3, %rs1, %rs2; + add.s64 %rd16, %rd1, %rd17; + st.global.b16 [%rd16], %rs3; + add.s64 %rd18, %rd18, %rd4; + add.s64 %rd17, %rd17, %rd6; + setp.lt.s64 %p2, %rd18, 512; + @%p2 bra $L__BB6_2; +$L__BB6_3: ret; } @@ -2696,74 +468,44 @@ $L__BB46_2: .param .u64 Unknown51_param_32 ) { - .reg .pred %p<4>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 25087; - @%p1 bra $L__BB47_2; - ld.param.u64 %rd5, [Unknown51_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown51_param_1]; - ld.param.u64 %rd7, [Unknown51_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 1; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 7; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 7; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 
%rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 1; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 1; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 7; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 7; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.u64 %rd37, %rd35, 1; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 49; - mul.lo.s64 %rd41, %rd32, 7; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - cvt.f32.f16 %f1, %h3; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h4, %f2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB47_2: + .reg .pred %p<3>; + .reg .b16 %rs<6>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 25087; + @%p1 bra $L__BB7_3; + ld.param.u64 %rd12, [Unknown51_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown51_param_1]; + ld.param.u64 %rd14, [Unknown51_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB7_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + add.rn.f16 %rs3, %rs1, %rs2; + mov.b16 %rs4, 0x0000; + max.NaN.f16 %rs5, %rs3, %rs4; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs5; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 25088; + @%p2 bra $L__BB7_2; +$L__BB7_3: ret; } @@ -2793,80 +535,42 @@ $L__BB47_2: .param .u64 Unknown49_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 2359295; - @%p1 bra $L__BB48_2; - ld.param.u64 %rd4, [Unknown49_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown49_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 
%rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 55; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -512; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 512; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 55; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 9; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 4608; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB48_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 2359295; + @%p1 bra $L__BB8_3; + ld.param.u64 %rd15, [Unknown49_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown49_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB8_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 2359296; + @%p2 bra $L__BB8_2; +$L__BB8_3: ret; } @@ -2896,69 +600,39 @@ $L__BB48_2: .param .u64 Unknown48_param_21 ) { - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 25087; - @%p1 bra $L__BB49_2; - ld.param.u64 %rd4, [Unknown48_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown48_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 1; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 7; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 7; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 1; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 1; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 7; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 7; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd33, 1; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 49; - mul.lo.s64 %rd39, %rd30, 7; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, 
%rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 %h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB49_2: + .reg .pred %p<3>; + .reg .b16 %rs<4>; + .reg .b32 %r<5>; + .reg .b64 %rd<19>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd13, %r3; + mul.wide.s32 %rd14, %r2, %r1; + add.s64 %rd18, %rd14, %rd13; + setp.gt.s64 %p1, %rd18, 25087; + @%p1 bra $L__BB9_3; + ld.param.u64 %rd11, [Unknown48_param_12]; + cvta.to.global.u64 %rd1, %rd11; + ld.param.u64 %rd12, [Unknown48_param_1]; + cvta.to.global.u64 %rd2, %rd12; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd17, %rd18, 1; + shl.b64 %rd6, %rd4, 1; +$L__BB9_2: + add.s64 %rd15, %rd2, %rd17; + ld.global.nc.u16 %rs1, [%rd15]; + mov.b16 %rs2, 0x0000; + max.NaN.f16 %rs3, %rs1, %rs2; + add.s64 %rd16, %rd1, %rd17; + st.global.b16 [%rd16], %rs3; + add.s64 %rd18, %rd18, %rd4; + add.s64 %rd17, %rd17, %rd6; + setp.lt.s64 %p2, %rd18, 25088; + @%p2 bra $L__BB9_2; +$L__BB9_3: ret; } @@ -2988,80 +662,42 @@ $L__BB49_2: .param .u64 Unknown46_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 1179647; - @%p1 bra $L__BB50_2; - ld.param.u64 %rd4, [Unknown46_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown46_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 56; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -256; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 256; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 56; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 8; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 2304; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB50_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, 
%tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 1179647; + @%p1 bra $L__BB10_3; + ld.param.u64 %rd15, [Unknown46_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown46_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB10_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 1179648; + @%p2 bra $L__BB10_2; +$L__BB10_3: ret; } @@ -3092,453 +728,41 @@ $L__BB50_2: ) { .reg .pred %p<3>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<27>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 131071; - @%p1 bra $L__BB51_2; - ld.param.u64 %rd4, [Unknown44_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown44_param_1]; - cvta.to.global.u64 %rd2, %rd5; - shr.s64 %rd8, %rd3, 63; - shr.u64 %rd9, %rd8, 56; - add.s64 %rd10, %rd3, %rd9; - and.b64 %rd11, %rd10, -256; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 256; - selp.b64 %rd14, %rd13, %rd12, %p2; - xor.b64 %rd15, %rd8, %rd3; - shr.s64 %rd16, %rd15, 63; - shr.u64 %rd17, %rd16, 56; - add.s64 %rd18, %rd15, %rd17; - shr.u64 %rd19, %rd18, 8; - xor.b64 %rd20, %rd19, %rd8; - shl.b64 %rd21, %rd20, 8; - add.s64 %rd22, %rd21, %rd14; - shl.b64 %rd23, %rd22, 2; - add.s64 %rd24, %rd2, %rd23; - ld.global.f32 %f1, [%rd24]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd25, %rd22, 1; - add.s64 %rd26, %rd1, %rd25; - st.global.b16 [%rd26], %h1; -$L__BB51_2: - ret; - -} - // .globl Unknown43 -.visible .entry Unknown43( - .param .u64 Unknown43_param_0, - .param .u64 Unknown43_param_1, - .param .u64 Unknown43_param_2, - .param .u64 Unknown43_param_3, - .param .u64 Unknown43_param_4, - .param .u64 Unknown43_param_5, - .param .u64 Unknown43_param_6, - .param .u64 Unknown43_param_7, - .param .u64 Unknown43_param_8, - .param .u64 Unknown43_param_9, - .param .u64 Unknown43_param_10, - .param .u64 Unknown43_param_11, - .param .u64 Unknown43_param_12, - .param .u64 Unknown43_param_13, - .param .u64 Unknown43_param_14, - .param .u64 Unknown43_param_15, - .param .u64 Unknown43_param_16, - .param .u64 Unknown43_param_17, - .param .u64 Unknown43_param_18, - .param .u64 Unknown43_param_19, - .param .u64 Unknown43_param_20, - .param .u64 Unknown43_param_21, - .param .u64 Unknown43_param_22, - .param .u64 Unknown43_param_23, - .param .u64 Unknown43_param_24, - .param .u64 Unknown43_param_25, - .param .u64 Unknown43_param_26, - .param .u64 Unknown43_param_27, - .param .u64 Unknown43_param_28, - .param .u64 Unknown43_param_29, - .param .u64 Unknown43_param_30, - .param .u64 Unknown43_param_31, - .param .u64 Unknown43_param_32 -) -{ - .reg .pred %p<4>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 50175; - @%p1 bra $L__BB52_2; - ld.param.u64 %rd5, [Unknown43_param_23]; - cvta.to.global.u64 %rd1, %rd5; - 
ld.param.u64 %rd6, [Unknown43_param_1]; - ld.param.u64 %rd7, [Unknown43_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 2; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 14; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 14; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 2; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 2; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 14; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 14; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 2; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 196; - mul.lo.s64 %rd41, %rd32, 14; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - cvt.f32.f16 %f1, %h3; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h4, %f2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB52_2: - ret; - -} - // .globl Unknown41 -.visible .entry Unknown41( - .param .u64 Unknown41_param_0, - .param .u64 Unknown41_param_1, - .param .u64 Unknown41_param_2, - .param .u64 Unknown41_param_3, - .param .u64 Unknown41_param_4, - .param .u64 Unknown41_param_5, - .param .u64 Unknown41_param_6, - .param .u64 Unknown41_param_7, - .param .u64 Unknown41_param_8, - .param .u64 Unknown41_param_9, - .param .u64 Unknown41_param_10, - .param .u64 Unknown41_param_11, - .param .u64 Unknown41_param_12, - .param .u64 Unknown41_param_13, - .param .u64 Unknown41_param_14, - .param .u64 Unknown41_param_15, - .param .u64 Unknown41_param_16, - .param .u64 Unknown41_param_17, - .param .u64 Unknown41_param_18, - .param .u64 Unknown41_param_19, - .param .u64 Unknown41_param_20, - .param .u64 Unknown41_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 589823; - @%p1 bra $L__BB53_2; - ld.param.u64 %rd4, [Unknown41_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown41_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 
%rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 56; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -256; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 256; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 56; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 8; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 2304; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB53_2: - ret; - -} - // .globl Unknown40 -.visible .entry Unknown40( - .param .u64 Unknown40_param_0, - .param .u64 Unknown40_param_1, - .param .u64 Unknown40_param_2, - .param .u64 Unknown40_param_3, - .param .u64 Unknown40_param_4, - .param .u64 Unknown40_param_5, - .param .u64 Unknown40_param_6, - .param .u64 Unknown40_param_7, - .param .u64 Unknown40_param_8, - .param .u64 Unknown40_param_9, - .param .u64 Unknown40_param_10, - .param .u64 Unknown40_param_11, - .param .u64 Unknown40_param_12, - .param .u64 Unknown40_param_13, - .param .u64 Unknown40_param_14, - .param .u64 Unknown40_param_15, - .param .u64 Unknown40_param_16, - .param .u64 Unknown40_param_17, - .param .u64 Unknown40_param_18, - .param .u64 Unknown40_param_19, - .param .u64 Unknown40_param_20, - .param .u64 Unknown40_param_21 -) -{ - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 50175; - @%p1 bra $L__BB54_2; - ld.param.u64 %rd4, [Unknown40_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown40_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 2; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 14; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 14; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 2; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 2; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 14; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 14; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 2; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 196; - mul.lo.s64 %rd39, %rd30, 14; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 
%h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB54_2: - ret; - -} - // .globl Unknown38 -.visible .entry Unknown38( - .param .u64 Unknown38_param_0, - .param .u64 Unknown38_param_1, - .param .u64 Unknown38_param_2, - .param .u64 Unknown38_param_3, - .param .u64 Unknown38_param_4, - .param .u64 Unknown38_param_5, - .param .u64 Unknown38_param_6, - .param .u64 Unknown38_param_7, - .param .u64 Unknown38_param_8, - .param .u64 Unknown38_param_9, - .param .u64 Unknown38_param_10, - .param .u64 Unknown38_param_11, - .param .u64 Unknown38_param_12, - .param .u64 Unknown38_param_13, - .param .u64 Unknown38_param_14, - .param .u64 Unknown38_param_15, - .param .u64 Unknown38_param_16, - .param .u64 Unknown38_param_17, - .param .u64 Unknown38_param_18, - .param .u64 Unknown38_param_19, - .param .u64 Unknown38_param_20, - .param .u64 Unknown38_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 589823; - @%p1 bra $L__BB55_2; - ld.param.u64 %rd4, [Unknown38_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown38_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 56; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -256; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 256; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 56; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 8; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 2304; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB55_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 131071; + @%p1 bra $L__BB11_3; + ld.param.u64 %rd15, [Unknown44_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, 
[Unknown44_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB11_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 131072; + @%p2 bra $L__BB11_2; +$L__BB11_3: ret; } @@ -3579,74 +803,44 @@ $L__BB55_2: .param .u64 Unknown37_param_32 ) { - .reg .pred %p<4>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 50175; - @%p1 bra $L__BB56_2; - ld.param.u64 %rd5, [Unknown37_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown37_param_1]; - ld.param.u64 %rd7, [Unknown37_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 2; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 14; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 14; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 2; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 2; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 14; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 14; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 2; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 196; - mul.lo.s64 %rd41, %rd32, 14; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - cvt.f32.f16 %f1, %h3; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h4, %f2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB56_2: + .reg .pred %p<3>; + .reg .b16 %rs<6>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 50175; + @%p1 bra $L__BB12_3; + ld.param.u64 %rd12, [Unknown37_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown37_param_1]; + ld.param.u64 %rd14, [Unknown37_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB12_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + add.rn.f16 %rs3, %rs1, %rs2; + mov.b16 %rs4, 0x0000; + max.NaN.f16 %rs5, %rs3, %rs4; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 
[%rd19], %rs5; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 50176; + @%p2 bra $L__BB12_2; +$L__BB12_3: ret; } @@ -3676,80 +870,42 @@ $L__BB56_2: .param .u64 Unknown35_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 589823; - @%p1 bra $L__BB57_2; - ld.param.u64 %rd4, [Unknown35_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown35_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 56; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -256; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 256; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 56; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 8; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 2304; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB57_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 589823; + @%p1 bra $L__BB13_3; + ld.param.u64 %rd15, [Unknown35_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown35_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB13_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 589824; + @%p2 bra $L__BB13_2; +$L__BB13_3: ret; } @@ -3779,69 +935,39 @@ $L__BB57_2: .param .u64 Unknown34_param_21 ) { - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - 
mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 50175; - @%p1 bra $L__BB58_2; - ld.param.u64 %rd4, [Unknown34_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown34_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 2; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 14; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 14; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 2; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 2; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 14; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 14; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 2; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 196; - mul.lo.s64 %rd39, %rd30, 14; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 %h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB58_2: + .reg .pred %p<3>; + .reg .b16 %rs<4>; + .reg .b32 %r<5>; + .reg .b64 %rd<19>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd13, %r3; + mul.wide.s32 %rd14, %r2, %r1; + add.s64 %rd18, %rd14, %rd13; + setp.gt.s64 %p1, %rd18, 50175; + @%p1 bra $L__BB14_3; + ld.param.u64 %rd11, [Unknown34_param_12]; + cvta.to.global.u64 %rd1, %rd11; + ld.param.u64 %rd12, [Unknown34_param_1]; + cvta.to.global.u64 %rd2, %rd12; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd17, %rd18, 1; + shl.b64 %rd6, %rd4, 1; +$L__BB14_2: + add.s64 %rd15, %rd2, %rd17; + ld.global.nc.u16 %rs1, [%rd15]; + mov.b16 %rs2, 0x0000; + max.NaN.f16 %rs3, %rs1, %rs2; + add.s64 %rd16, %rd1, %rd17; + st.global.b16 [%rd16], %rs3; + add.s64 %rd18, %rd18, %rd4; + add.s64 %rd17, %rd17, %rd6; + setp.lt.s64 %p2, %rd18, 50176; + @%p2 bra $L__BB14_2; +$L__BB14_3: ret; } @@ -3871,80 +997,42 @@ $L__BB58_2: .param .u64 Unknown32_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 294911; - @%p1 bra $L__BB59_2; - ld.param.u64 %rd4, [Unknown32_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown32_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 
6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 57; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -128; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 128; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 57; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 7; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 1152; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB59_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 294911; + @%p1 bra $L__BB15_3; + ld.param.u64 %rd15, [Unknown32_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown32_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB15_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 294912; + @%p2 bra $L__BB15_2; +$L__BB15_3: ret; } @@ -3975,453 +1063,41 @@ $L__BB59_2: ) { .reg .pred %p<3>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<27>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 32767; - @%p1 bra $L__BB60_2; - ld.param.u64 %rd4, [Unknown30_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown30_param_1]; - cvta.to.global.u64 %rd2, %rd5; - shr.s64 %rd8, %rd3, 63; - shr.u64 %rd9, %rd8, 57; - add.s64 %rd10, %rd3, %rd9; - and.b64 %rd11, %rd10, -128; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 128; - selp.b64 %rd14, %rd13, %rd12, %p2; - xor.b64 %rd15, %rd8, %rd3; - shr.s64 %rd16, %rd15, 63; - shr.u64 %rd17, %rd16, 57; - add.s64 %rd18, %rd15, %rd17; - shr.u64 %rd19, %rd18, 7; - xor.b64 %rd20, %rd19, %rd8; - shl.b64 %rd21, %rd20, 7; - add.s64 %rd22, %rd21, %rd14; - shl.b64 %rd23, %rd22, 2; - add.s64 %rd24, %rd2, %rd23; - ld.global.f32 %f1, [%rd24]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd25, %rd22, 1; - add.s64 %rd26, %rd1, %rd25; - st.global.b16 [%rd26], %h1; -$L__BB60_2: - ret; - -} - // .globl Unknown29 -.visible .entry Unknown29( - .param .u64 Unknown29_param_0, 
- .param .u64 Unknown29_param_1, - .param .u64 Unknown29_param_2, - .param .u64 Unknown29_param_3, - .param .u64 Unknown29_param_4, - .param .u64 Unknown29_param_5, - .param .u64 Unknown29_param_6, - .param .u64 Unknown29_param_7, - .param .u64 Unknown29_param_8, - .param .u64 Unknown29_param_9, - .param .u64 Unknown29_param_10, - .param .u64 Unknown29_param_11, - .param .u64 Unknown29_param_12, - .param .u64 Unknown29_param_13, - .param .u64 Unknown29_param_14, - .param .u64 Unknown29_param_15, - .param .u64 Unknown29_param_16, - .param .u64 Unknown29_param_17, - .param .u64 Unknown29_param_18, - .param .u64 Unknown29_param_19, - .param .u64 Unknown29_param_20, - .param .u64 Unknown29_param_21, - .param .u64 Unknown29_param_22, - .param .u64 Unknown29_param_23, - .param .u64 Unknown29_param_24, - .param .u64 Unknown29_param_25, - .param .u64 Unknown29_param_26, - .param .u64 Unknown29_param_27, - .param .u64 Unknown29_param_28, - .param .u64 Unknown29_param_29, - .param .u64 Unknown29_param_30, - .param .u64 Unknown29_param_31, - .param .u64 Unknown29_param_32 -) -{ - .reg .pred %p<4>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 100351; - @%p1 bra $L__BB61_2; - ld.param.u64 %rd5, [Unknown29_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown29_param_1]; - ld.param.u64 %rd7, [Unknown29_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 3; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 28; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 28; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 3; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 3; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 28; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 28; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 3; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 784; - mul.lo.s64 %rd41, %rd32, 28; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - cvt.f32.f16 %f1, %h3; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h4, %f2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB61_2: - ret; - -} - // .globl Unknown27 -.visible .entry Unknown27( - .param .u64 Unknown27_param_0, - .param .u64 Unknown27_param_1, - .param .u64 Unknown27_param_2, - .param .u64 Unknown27_param_3, - .param .u64 Unknown27_param_4, - .param .u64 Unknown27_param_5, - .param .u64 Unknown27_param_6, - .param .u64 Unknown27_param_7, - .param .u64 Unknown27_param_8, - .param .u64 Unknown27_param_9, - .param .u64 Unknown27_param_10, - .param .u64 
Unknown27_param_11, - .param .u64 Unknown27_param_12, - .param .u64 Unknown27_param_13, - .param .u64 Unknown27_param_14, - .param .u64 Unknown27_param_15, - .param .u64 Unknown27_param_16, - .param .u64 Unknown27_param_17, - .param .u64 Unknown27_param_18, - .param .u64 Unknown27_param_19, - .param .u64 Unknown27_param_20, - .param .u64 Unknown27_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 147455; - @%p1 bra $L__BB62_2; - ld.param.u64 %rd4, [Unknown27_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown27_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 57; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -128; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 128; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 57; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 7; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 1152; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB62_2: - ret; - -} - // .globl Unknown26 -.visible .entry Unknown26( - .param .u64 Unknown26_param_0, - .param .u64 Unknown26_param_1, - .param .u64 Unknown26_param_2, - .param .u64 Unknown26_param_3, - .param .u64 Unknown26_param_4, - .param .u64 Unknown26_param_5, - .param .u64 Unknown26_param_6, - .param .u64 Unknown26_param_7, - .param .u64 Unknown26_param_8, - .param .u64 Unknown26_param_9, - .param .u64 Unknown26_param_10, - .param .u64 Unknown26_param_11, - .param .u64 Unknown26_param_12, - .param .u64 Unknown26_param_13, - .param .u64 Unknown26_param_14, - .param .u64 Unknown26_param_15, - .param .u64 Unknown26_param_16, - .param .u64 Unknown26_param_17, - .param .u64 Unknown26_param_18, - .param .u64 Unknown26_param_19, - .param .u64 Unknown26_param_20, - .param .u64 Unknown26_param_21 -) -{ - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, 
%r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 100351; - @%p1 bra $L__BB63_2; - ld.param.u64 %rd4, [Unknown26_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown26_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 3; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 28; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 28; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 3; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 3; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 28; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 28; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 3; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 784; - mul.lo.s64 %rd39, %rd30, 28; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 %h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB63_2: - ret; - -} - // .globl Unknown24 -.visible .entry Unknown24( - .param .u64 Unknown24_param_0, - .param .u64 Unknown24_param_1, - .param .u64 Unknown24_param_2, - .param .u64 Unknown24_param_3, - .param .u64 Unknown24_param_4, - .param .u64 Unknown24_param_5, - .param .u64 Unknown24_param_6, - .param .u64 Unknown24_param_7, - .param .u64 Unknown24_param_8, - .param .u64 Unknown24_param_9, - .param .u64 Unknown24_param_10, - .param .u64 Unknown24_param_11, - .param .u64 Unknown24_param_12, - .param .u64 Unknown24_param_13, - .param .u64 Unknown24_param_14, - .param .u64 Unknown24_param_15, - .param .u64 Unknown24_param_16, - .param .u64 Unknown24_param_17, - .param .u64 Unknown24_param_18, - .param .u64 Unknown24_param_19, - .param .u64 Unknown24_param_20, - .param .u64 Unknown24_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 147455; - @%p1 bra $L__BB64_2; - ld.param.u64 %rd4, [Unknown24_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown24_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - 
sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 57; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -128; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 128; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 57; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 7; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 1152; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB64_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 32767; + @%p1 bra $L__BB16_3; + ld.param.u64 %rd15, [Unknown30_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown30_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB16_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 32768; + @%p2 bra $L__BB16_2; +$L__BB16_3: ret; } @@ -4462,74 +1138,44 @@ $L__BB64_2: .param .u64 Unknown23_param_32 ) { - .reg .pred %p<4>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 100351; - @%p1 bra $L__BB65_2; - ld.param.u64 %rd5, [Unknown23_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown23_param_1]; - ld.param.u64 %rd7, [Unknown23_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 3; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 28; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 28; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 3; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 3; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 28; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 28; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 
%rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 3; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 784; - mul.lo.s64 %rd41, %rd32, 28; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - cvt.f32.f16 %f1, %h3; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h4, %f2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB65_2: + .reg .pred %p<3>; + .reg .b16 %rs<6>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 100351; + @%p1 bra $L__BB17_3; + ld.param.u64 %rd12, [Unknown23_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown23_param_1]; + ld.param.u64 %rd14, [Unknown23_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB17_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + add.rn.f16 %rs3, %rs1, %rs2; + mov.b16 %rs4, 0x0000; + max.NaN.f16 %rs5, %rs3, %rs4; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs5; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 100352; + @%p2 bra $L__BB17_2; +$L__BB17_3: ret; } @@ -4559,80 +1205,42 @@ $L__BB65_2: .param .u64 Unknown21_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 147455; - @%p1 bra $L__BB66_2; - ld.param.u64 %rd4, [Unknown21_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown21_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 57; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -128; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 128; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 57; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 7; - xor.b64 %rd46, %rd45, %rd34; - 
mul.lo.s64 %rd47, %rd46, 1152; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB66_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 147455; + @%p1 bra $L__BB18_3; + ld.param.u64 %rd15, [Unknown21_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown21_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB18_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 147456; + @%p2 bra $L__BB18_2; +$L__BB18_3: ret; } @@ -4662,69 +1270,39 @@ $L__BB66_2: .param .u64 Unknown20_param_21 ) { - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 100351; - @%p1 bra $L__BB67_2; - ld.param.u64 %rd4, [Unknown20_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown20_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 3; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 28; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 28; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 3; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 3; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 28; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 28; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 3; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 784; - mul.lo.s64 %rd39, %rd30, 28; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 %h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB67_2: + .reg .pred %p<3>; + .reg .b16 %rs<4>; + .reg .b32 %r<5>; + .reg .b64 %rd<19>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd13, %r3; + mul.wide.s32 %rd14, %r2, %r1; + add.s64 %rd18, %rd14, %rd13; + setp.gt.s64 %p1, %rd18, 100351; + @%p1 bra $L__BB19_3; + ld.param.u64 %rd11, [Unknown20_param_12]; + 
cvta.to.global.u64 %rd1, %rd11; + ld.param.u64 %rd12, [Unknown20_param_1]; + cvta.to.global.u64 %rd2, %rd12; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd17, %rd18, 1; + shl.b64 %rd6, %rd4, 1; +$L__BB19_2: + add.s64 %rd15, %rd2, %rd17; + ld.global.nc.u16 %rs1, [%rd15]; + mov.b16 %rs2, 0x0000; + max.NaN.f16 %rs3, %rs1, %rs2; + add.s64 %rd16, %rd1, %rd17; + st.global.b16 [%rd16], %rs3; + add.s64 %rd18, %rd18, %rd4; + add.s64 %rd17, %rd17, %rd6; + setp.lt.s64 %p2, %rd18, 100352; + @%p2 bra $L__BB19_2; +$L__BB19_3: ret; } @@ -4754,80 +1332,42 @@ $L__BB67_2: .param .u64 Unknown18_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 73727; - @%p1 bra $L__BB68_2; - ld.param.u64 %rd4, [Unknown18_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown18_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB68_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 73727; + @%p1 bra $L__BB20_3; + ld.param.u64 %rd15, [Unknown18_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown18_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB20_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + 
st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 73728; + @%p2 bra $L__BB20_2; +$L__BB20_3: ret; } @@ -4858,453 +1398,41 @@ $L__BB68_2: ) { .reg .pred %p<3>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<27>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 8191; - @%p1 bra $L__BB69_2; - ld.param.u64 %rd4, [Unknown16_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown16_param_1]; - cvta.to.global.u64 %rd2, %rd5; - shr.s64 %rd8, %rd3, 63; - shr.u64 %rd9, %rd8, 58; - add.s64 %rd10, %rd3, %rd9; - and.b64 %rd11, %rd10, -64; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 64; - selp.b64 %rd14, %rd13, %rd12, %p2; - xor.b64 %rd15, %rd8, %rd3; - shr.s64 %rd16, %rd15, 63; - shr.u64 %rd17, %rd16, 58; - add.s64 %rd18, %rd15, %rd17; - shr.u64 %rd19, %rd18, 6; - xor.b64 %rd20, %rd19, %rd8; - shl.b64 %rd21, %rd20, 6; - add.s64 %rd22, %rd21, %rd14; - shl.b64 %rd23, %rd22, 2; - add.s64 %rd24, %rd2, %rd23; - ld.global.f32 %f1, [%rd24]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd25, %rd22, 1; - add.s64 %rd26, %rd1, %rd25; - st.global.b16 [%rd26], %h1; -$L__BB69_2: - ret; - -} - // .globl Unknown15 -.visible .entry Unknown15( - .param .u64 Unknown15_param_0, - .param .u64 Unknown15_param_1, - .param .u64 Unknown15_param_2, - .param .u64 Unknown15_param_3, - .param .u64 Unknown15_param_4, - .param .u64 Unknown15_param_5, - .param .u64 Unknown15_param_6, - .param .u64 Unknown15_param_7, - .param .u64 Unknown15_param_8, - .param .u64 Unknown15_param_9, - .param .u64 Unknown15_param_10, - .param .u64 Unknown15_param_11, - .param .u64 Unknown15_param_12, - .param .u64 Unknown15_param_13, - .param .u64 Unknown15_param_14, - .param .u64 Unknown15_param_15, - .param .u64 Unknown15_param_16, - .param .u64 Unknown15_param_17, - .param .u64 Unknown15_param_18, - .param .u64 Unknown15_param_19, - .param .u64 Unknown15_param_20, - .param .u64 Unknown15_param_21, - .param .u64 Unknown15_param_22, - .param .u64 Unknown15_param_23, - .param .u64 Unknown15_param_24, - .param .u64 Unknown15_param_25, - .param .u64 Unknown15_param_26, - .param .u64 Unknown15_param_27, - .param .u64 Unknown15_param_28, - .param .u64 Unknown15_param_29, - .param .u64 Unknown15_param_30, - .param .u64 Unknown15_param_31, - .param .u64 Unknown15_param_32 -) -{ - .reg .pred %p<4>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 200703; - @%p1 bra $L__BB70_2; - ld.param.u64 %rd5, [Unknown15_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown15_param_1]; - ld.param.u64 %rd7, [Unknown15_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 4; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 56; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 56; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 4; - add.s64 
%rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 4; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 56; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 56; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 4; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 3136; - mul.lo.s64 %rd41, %rd32, 56; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - cvt.f32.f16 %f1, %h3; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h4, %f2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB70_2: - ret; - -} - // .globl Unknown13 -.visible .entry Unknown13( - .param .u64 Unknown13_param_0, - .param .u64 Unknown13_param_1, - .param .u64 Unknown13_param_2, - .param .u64 Unknown13_param_3, - .param .u64 Unknown13_param_4, - .param .u64 Unknown13_param_5, - .param .u64 Unknown13_param_6, - .param .u64 Unknown13_param_7, - .param .u64 Unknown13_param_8, - .param .u64 Unknown13_param_9, - .param .u64 Unknown13_param_10, - .param .u64 Unknown13_param_11, - .param .u64 Unknown13_param_12, - .param .u64 Unknown13_param_13, - .param .u64 Unknown13_param_14, - .param .u64 Unknown13_param_15, - .param .u64 Unknown13_param_16, - .param .u64 Unknown13_param_17, - .param .u64 Unknown13_param_18, - .param .u64 Unknown13_param_19, - .param .u64 Unknown13_param_20, - .param .u64 Unknown13_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 36863; - @%p1 bra $L__BB71_2; - ld.param.u64 %rd4, [Unknown13_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown13_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, 
%rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB71_2: - ret; - -} - // .globl Unknown12 -.visible .entry Unknown12( - .param .u64 Unknown12_param_0, - .param .u64 Unknown12_param_1, - .param .u64 Unknown12_param_2, - .param .u64 Unknown12_param_3, - .param .u64 Unknown12_param_4, - .param .u64 Unknown12_param_5, - .param .u64 Unknown12_param_6, - .param .u64 Unknown12_param_7, - .param .u64 Unknown12_param_8, - .param .u64 Unknown12_param_9, - .param .u64 Unknown12_param_10, - .param .u64 Unknown12_param_11, - .param .u64 Unknown12_param_12, - .param .u64 Unknown12_param_13, - .param .u64 Unknown12_param_14, - .param .u64 Unknown12_param_15, - .param .u64 Unknown12_param_16, - .param .u64 Unknown12_param_17, - .param .u64 Unknown12_param_18, - .param .u64 Unknown12_param_19, - .param .u64 Unknown12_param_20, - .param .u64 Unknown12_param_21 -) -{ - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 200703; - @%p1 bra $L__BB72_2; - ld.param.u64 %rd4, [Unknown12_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown12_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 4; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 56; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 56; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 4; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 4; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 56; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 56; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 4; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 3136; - mul.lo.s64 %rd39, %rd30, 56; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 %h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB72_2: - ret; - -} - // .globl Unknown10 -.visible .entry Unknown10( - .param .u64 Unknown10_param_0, - .param .u64 Unknown10_param_1, - .param .u64 Unknown10_param_2, - .param .u64 Unknown10_param_3, - .param .u64 Unknown10_param_4, - .param .u64 Unknown10_param_5, - .param .u64 Unknown10_param_6, - .param .u64 Unknown10_param_7, - .param .u64 Unknown10_param_8, - .param .u64 Unknown10_param_9, - .param .u64 Unknown10_param_10, - .param 
.u64 Unknown10_param_11, - .param .u64 Unknown10_param_12, - .param .u64 Unknown10_param_13, - .param .u64 Unknown10_param_14, - .param .u64 Unknown10_param_15, - .param .u64 Unknown10_param_16, - .param .u64 Unknown10_param_17, - .param .u64 Unknown10_param_18, - .param .u64 Unknown10_param_19, - .param .u64 Unknown10_param_20, - .param .u64 Unknown10_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 36863; - @%p1 bra $L__BB73_2; - ld.param.u64 %rd4, [Unknown10_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown10_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB73_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 8191; + @%p1 bra $L__BB21_3; + ld.param.u64 %rd15, [Unknown16_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown16_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB21_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 8192; + @%p2 bra $L__BB21_2; +$L__BB21_3: ret; } @@ -5345,177 +1473,44 @@ $L__BB73_2: .param .u64 Unknown9_param_32 ) { - .reg 
.pred %p<4>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 200703; - @%p1 bra $L__BB74_2; - ld.param.u64 %rd5, [Unknown9_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown9_param_1]; - ld.param.u64 %rd7, [Unknown9_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 4; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 56; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 56; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 4; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 4; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 56; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 56; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 4; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 3136; - mul.lo.s64 %rd41, %rd32, 56; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - cvt.f32.f16 %f1, %h3; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h4, %f2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB74_2: - ret; - -} - // .globl Unknown7 -.visible .entry Unknown7( - .param .u64 Unknown7_param_0, - .param .u64 Unknown7_param_1, - .param .u64 Unknown7_param_2, - .param .u64 Unknown7_param_3, - .param .u64 Unknown7_param_4, - .param .u64 Unknown7_param_5, - .param .u64 Unknown7_param_6, - .param .u64 Unknown7_param_7, - .param .u64 Unknown7_param_8, - .param .u64 Unknown7_param_9, - .param .u64 Unknown7_param_10, - .param .u64 Unknown7_param_11, - .param .u64 Unknown7_param_12, - .param .u64 Unknown7_param_13, - .param .u64 Unknown7_param_14, - .param .u64 Unknown7_param_15, - .param .u64 Unknown7_param_16, - .param .u64 Unknown7_param_17, - .param .u64 Unknown7_param_18, - .param .u64 Unknown7_param_19, - .param .u64 Unknown7_param_20, - .param .u64 Unknown7_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 36863; - @%p1 bra $L__BB75_2; - ld.param.u64 %rd4, [Unknown7_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown7_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, 
%rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB75_2: + .reg .pred %p<3>; + .reg .b16 %rs<6>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 200703; + @%p1 bra $L__BB22_3; + ld.param.u64 %rd12, [Unknown9_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown9_param_1]; + ld.param.u64 %rd14, [Unknown9_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB22_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + add.rn.f16 %rs3, %rs1, %rs2; + mov.b16 %rs4, 0x0000; + max.NaN.f16 %rs5, %rs3, %rs4; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs5; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 200704; + @%p2 bra $L__BB22_2; +$L__BB22_3: ret; } @@ -5545,69 +1540,39 @@ $L__BB75_2: .param .u64 Unknown6_param_21 ) { - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 200703; - @%p1 bra $L__BB76_2; - ld.param.u64 %rd4, [Unknown6_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown6_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 4; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 56; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 56; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 4; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 
5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 4; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 56; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 56; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 4; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 3136; - mul.lo.s64 %rd39, %rd30, 56; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 %h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB76_2: + .reg .pred %p<3>; + .reg .b16 %rs<4>; + .reg .b32 %r<5>; + .reg .b64 %rd<19>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd13, %r3; + mul.wide.s32 %rd14, %r2, %r1; + add.s64 %rd18, %rd14, %rd13; + setp.gt.s64 %p1, %rd18, 200703; + @%p1 bra $L__BB23_3; + ld.param.u64 %rd11, [Unknown6_param_12]; + cvta.to.global.u64 %rd1, %rd11; + ld.param.u64 %rd12, [Unknown6_param_1]; + cvta.to.global.u64 %rd2, %rd12; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd17, %rd18, 1; + shl.b64 %rd6, %rd4, 1; +$L__BB23_2: + add.s64 %rd15, %rd2, %rd17; + ld.global.nc.u16 %rs1, [%rd15]; + mov.b16 %rs2, 0x0000; + max.NaN.f16 %rs3, %rs1, %rs2; + add.s64 %rd16, %rd1, %rd17; + st.global.b16 [%rd16], %rs3; + add.s64 %rd18, %rd18, %rd4; + add.s64 %rd17, %rd17, %rd6; + setp.lt.s64 %p2, %rd18, 200704; + @%p2 bra $L__BB23_2; +$L__BB23_3: ret; } @@ -5637,80 +1602,42 @@ $L__BB76_2: .param .u64 Unknown4_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 36863; - @%p1 bra $L__BB77_2; - ld.param.u64 %rd4, [Unknown4_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown4_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 
%rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB77_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 36863; + @%p1 bra $L__BB24_3; + ld.param.u64 %rd15, [Unknown4_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown4_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB24_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 36864; + @%p2 bra $L__BB24_2; +$L__BB24_3: ret; } @@ -5740,69 +1667,39 @@ $L__BB77_2: .param .u64 Unknown3_param_21 ) { - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 802815; - @%p1 bra $L__BB78_2; - ld.param.u64 %rd4, [Unknown3_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown3_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 5; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 112; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 112; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 5; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 5; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 112; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 112; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 5; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 12544; - mul.lo.s64 %rd39, %rd30, 112; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 %h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB78_2: + .reg .pred %p<3>; + .reg .b16 %rs<4>; + .reg .b32 %r<5>; + .reg .b64 %rd<19>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd13, %r3; + mul.wide.s32 %rd14, %r2, %r1; + add.s64 %rd18, %rd14, 
%rd13; + setp.gt.s64 %p1, %rd18, 802815; + @%p1 bra $L__BB25_3; + ld.param.u64 %rd11, [Unknown3_param_12]; + cvta.to.global.u64 %rd1, %rd11; + ld.param.u64 %rd12, [Unknown3_param_1]; + cvta.to.global.u64 %rd2, %rd12; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd17, %rd18, 1; + shl.b64 %rd6, %rd4, 1; +$L__BB25_2: + add.s64 %rd15, %rd2, %rd17; + ld.global.nc.u16 %rs1, [%rd15]; + mov.b16 %rs2, 0x0000; + max.NaN.f16 %rs3, %rs1, %rs2; + add.s64 %rd16, %rd1, %rd17; + st.global.b16 [%rd16], %rs3; + add.s64 %rd18, %rd18, %rd4; + add.s64 %rd17, %rd17, %rd6; + setp.lt.s64 %p2, %rd18, 802816; + @%p2 bra $L__BB25_2; +$L__BB25_3: ret; } @@ -5832,84 +1729,42 @@ $L__BB78_2: .param .u64 Unknown1_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<61>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 9407; - @%p1 bra $L__BB79_2; - ld.param.u64 %rd4, [Unknown1_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown1_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 1; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 7; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 7; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 1; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 1; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 7; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 7; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 1; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.hi.s64 %rd38, %rd37, 6148914691236517206; - shr.u64 %rd39, %rd38, 63; - add.s64 %rd40, %rd38, %rd39; - mul.lo.s64 %rd41, %rd40, 3; - sub.s64 %rd42, %rd37, %rd41; - setp.lt.s64 %p4, %rd42, 0; - add.s64 %rd43, %rd42, 3; - selp.b64 %rd44, %rd43, %rd42, %p4; - shr.s64 %rd45, %rd37, 63; - xor.b64 %rd46, %rd45, %rd37; - mul.hi.s64 %rd47, %rd46, 6148914691236517206; - shr.u64 %rd48, %rd47, 63; - add.s64 %rd49, %rd47, %rd48; - xor.b64 %rd50, %rd49, %rd45; - mul.lo.s64 %rd51, %rd50, 147; - mul.lo.s64 %rd52, %rd44, 49; - mul.lo.s64 %rd53, %rd30, 7; - add.s64 %rd54, %rd53, %rd15; - add.s64 %rd55, %rd54, %rd52; - add.s64 %rd56, %rd55, %rd51; - shl.b64 %rd57, %rd56, 2; - add.s64 %rd58, %rd2, %rd57; - ld.global.f32 %f1, [%rd58]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd59, %rd56, 1; - add.s64 %rd60, %rd1, %rd59; - st.global.b16 [%rd60], %h1; -$L__BB79_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 9407; + @%p1 bra $L__BB26_3; + ld.param.u64 %rd15, [Unknown1_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown1_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, 
%r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB26_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 9408; + @%p2 bra $L__BB26_2; +$L__BB26_3: ret; } @@ -5939,68 +1794,184 @@ $L__BB79_2: .param .u64 Unknown0_param_21 ) { - .reg .pred %p<4>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<46>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 150527; - @%p1 bra $L__BB80_2; - ld.param.u64 %rd4, [Unknown0_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown0_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 6; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 224; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 224; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 6; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 6; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 224; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 224; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 6; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 50176; - mul.lo.s64 %rd39, %rd30, 224; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 2; - add.s64 %rd43, %rd2, %rd42; - ld.global.f32 %f1, [%rd43]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd44, %rd41, 1; - add.s64 %rd45, %rd1, %rd44; - st.global.b16 [%rd45], %h1; -$L__BB80_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 150527; + @%p1 bra $L__BB27_3; + ld.param.u64 %rd15, [Unknown0_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown0_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB27_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 150528; + @%p2 bra $L__BB27_2; +$L__BB27_3: + ret; + +} + // .globl Unknown58_kernel +.visible .entry Unknown58_kernel( + .param .u64 Unknown58_kernel_param_0, + .param .u64 Unknown58_kernel_param_1, + .param .u64 Unknown58_kernel_param_2, + .param .u64 Unknown58_kernel_param_3, + .param .u64 Unknown58_kernel_param_4, 
+ .param .u64 Unknown58_kernel_param_5, + .param .u64 Unknown58_kernel_param_6, + .param .u64 Unknown58_kernel_param_7, + .param .u64 Unknown58_kernel_param_8, + .param .u64 Unknown58_kernel_param_9, + .param .u64 Unknown58_kernel_param_10, + .param .u64 Unknown58_kernel_param_11 +) +{ + .reg .pred %p<8>; + .reg .b16 %rs<37>; + .reg .b32 %r<12>; + .reg .b64 %rd<55>; + // demoted variable + .shared .align 2 .b8 __wg_Unknown58_kernel_0[128]; + // demoted variable + .shared .align 2 .b8 __wg_Unknown58_kernel_1[64]; + // demoted variable + .shared .align 2 .b8 __wg_Unknown58_kernel_2[32]; + // demoted variable + .shared .align 2 .b8 __wg_Unknown58_kernel_3[16]; + // demoted variable + .shared .align 2 .b8 __wg_Unknown58_kernel_4[8]; + // demoted variable + .shared .align 2 .b8 __wg_Unknown58_kernel_5[4]; + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %tid.x; + cvt.s64.s32 %rd3, %r2; + and.b64 %rd9, %rd3, 63; + min.u64 %rd10, %rd9, 49; + min.u64 %rd11, %rd9, 48; + add.s64 %rd12, %rd11, 1; + setp.eq.s64 %p1, %rd12, %rd10; + mov.b16 %rs36, 0x0000; + @%p1 bra $L__BB28_2; + ld.param.u64 %rd7, [Unknown58_kernel_param_1]; + cvta.to.global.u64 %rd8, %rd7; + mul.wide.s32 %rd13, %r1, 98; + add.s64 %rd14, %rd8, %rd13; + shl.b64 %rd15, %rd10, 1; + add.s64 %rd4, %rd14, %rd15; + ld.global.nc.u16 %rs4, [%rd4]; + mov.b16 %rs5, 0x0000; + add.rn.f16 %rs36, %rs4, %rs5; +$L__BB28_2: + cvt.u32.u64 %r3, %rd3; + shl.b64 %rd16, %rd3, 1; + mov.u64 %rd17, __wg_Unknown58_kernel_0; + add.s64 %rd5, %rd17, %rd16; + st.shared.b16 [%rd5], %rs36; + bar.sync 0; + setp.gt.u32 %p2, %r3, 31; + mov.u64 %rd54, __wg_Unknown58_kernel_1; + @%p2 bra $L__BB28_4; + add.s64 %rd19, %rd5, %rd16; + ld.shared.b32 %r4, [%rd19]; + mov.b32 {%rs6, %rs7}, %r4; + mov.b16 %rs8, 0x0000; + add.rn.f16 %rs9, %rs6, %rs8; + add.rn.f16 %rs10, %rs7, %rs9; + add.s64 %rd21, %rd54, %rd16; + st.shared.b16 [%rd21], %rs10; +$L__BB28_4: + bar.sync 0; + setp.gt.u32 %p3, %r3, 15; + shl.b64 %rd52, %rd3, 2; + mov.u64 %rd53, __wg_Unknown58_kernel_2; + @%p3 bra $L__BB28_6; + add.s64 %rd25, %rd54, %rd52; + ld.shared.b32 %r6, [%rd25]; + mov.b32 {%rs11, %rs12}, %r6; + mov.b16 %rs13, 0x0000; + add.rn.f16 %rs14, %rs11, %rs13; + add.rn.f16 %rs15, %rs12, %rs14; + add.s64 %rd27, %rd53, %rd16; + st.shared.b16 [%rd27], %rs15; +$L__BB28_6: + bar.sync 0; + setp.gt.u32 %p4, %r3, 7; + mov.u64 %rd51, __wg_Unknown58_kernel_3; + @%p4 bra $L__BB28_8; + add.s64 %rd31, %rd53, %rd52; + ld.shared.b32 %r8, [%rd31]; + mov.b32 {%rs16, %rs17}, %r8; + mov.b16 %rs18, 0x0000; + add.rn.f16 %rs19, %rs16, %rs18; + add.rn.f16 %rs20, %rs17, %rs19; + add.s64 %rd33, %rd51, %rd16; + st.shared.b16 [%rd33], %rs20; +$L__BB28_8: + bar.sync 0; + setp.gt.u32 %p5, %r3, 3; + mov.u64 %rd49, __wg_Unknown58_kernel_4; + @%p5 bra $L__BB28_10; + add.s64 %rd37, %rd51, %rd52; + ld.shared.b16 %rs21, [%rd37]; + mov.b16 %rs22, 0x0000; + add.rn.f16 %rs23, %rs21, %rs22; + ld.shared.b16 %rs24, [%rd37+2]; + add.rn.f16 %rs25, %rs24, %rs23; + add.s64 %rd39, %rd49, %rd16; + st.shared.b16 [%rd39], %rs25; +$L__BB28_10: + bar.sync 0; + setp.gt.u32 %p6, %r3, 1; + @%p6 bra $L__BB28_12; + add.s64 %rd43, %rd49, %rd52; + ld.shared.b16 %rs26, [%rd43]; + mov.b16 %rs27, 0x0000; + add.rn.f16 %rs28, %rs26, %rs27; + ld.shared.b16 %rs29, [%rd43+2]; + add.rn.f16 %rs30, %rs29, %rs28; + mov.u64 %rd44, __wg_Unknown58_kernel_5; + add.s64 %rd45, %rd44, %rd16; + st.shared.b16 [%rd45], %rs30; +$L__BB28_12: + bar.sync 0; + setp.ne.s32 %p7, %r3, 0; + @%p7 bra $L__BB28_14; + ld.param.u64 %rd6, [Unknown58_kernel_param_8]; + cvta.to.global.u64 %rd1, %rd6; + 
cvt.s64.s32 %rd2, %r1; + ld.shared.b16 %rs31, [__wg_Unknown58_kernel_5]; + mov.b16 %rs32, 0x0000; + add.rn.f16 %rs33, %rs31, %rs32; + ld.shared.b16 %rs34, [__wg_Unknown58_kernel_5+2]; + add.rn.f16 %rs35, %rs34, %rs33; + shl.b64 %rd46, %rd2, 1; + add.s64 %rd47, %rd1, %rd46; + st.global.b16 [%rd47], %rs35; +$L__BB28_14: + bar.sync 0; ret; } diff --git a/compiler/test/E2E/ResNet18/FW/host_output.mlir b/compiler/test/E2E/ResNet18/FW/host_output.mlir index 9dafad4e7..35356cb24 100644 --- a/compiler/test/E2E/ResNet18/FW/host_output.mlir +++ b/compiler/test/E2E/ResNet18/FW/host_output.mlir @@ -5,182 +5,184 @@ module attributes {byre.container_module, gpu.container_module} { func.func @main(%arg0: memref<64xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<64xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<1000xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<1000x512xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<64xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<64xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<64xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<64xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input15", byre.argtype = 1 : i32}, %arg16: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<128xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<128xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<128xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<128xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<128xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<128xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<128xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<128xf32, "cuda"> {byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<128xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<128xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, %arg31: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<256xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<256xf32, "cuda"> {byre.argname = 
"Input33", byre.argtype = 1 : i32}, %arg34: memref<256xf32, "cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<256xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input37", byre.argtype = 1 : i32}, %arg38: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<256xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<256xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<256xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<256xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<256xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<256xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<512xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<512xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<512xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<512xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<512xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<512xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<512xf32, "cuda"> {byre.argname = "Input56", byre.argtype = 1 : i32}, %arg57: memref<512xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<512xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<512xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<64xf32, "cuda"> {byre.argname = "Input63", byre.argtype = 1 : i32}, %arg64: memref<64xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<64xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<64xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<64xf32, "cuda"> {byre.argname = "Input69", byre.argtype = 1 : i32}, %arg70: memref<64xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<64xf32, "cuda"> {byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<64xf32, "cuda"> {byre.argname = "Input73", byre.argtype = 1 : i32}, %arg74: memref {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<64xf32, "cuda"> {byre.argname = "Input75", byre.argtype = 1 : i32}, 
%arg76: memref<64xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<128xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: memref<128xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<128xf32, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<128xf32, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<128xf32, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<128xf32, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<128xf32, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<128xf32, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<128xf32, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<128xf32, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<256xf32, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<256xf32, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref {byre.argname = "Input95", byre.argtype = 1 : i32}, %arg96: memref<256xf32, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<256xf32, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<256xf32, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<256xf32, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref {byre.argname = "Input101", byre.argtype = 1 : i32}, %arg102: memref<256xf32, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<256xf32, "cuda"> {byre.argname = "Input103", byre.argtype = 1 : i32}, %arg104: memref {byre.argname = "Input104", byre.argtype = 1 : i32}, %arg105: memref<256xf32, "cuda"> {byre.argname = "Input105", byre.argtype = 1 : i32}, %arg106: memref<256xf32, "cuda"> {byre.argname = "Input106", byre.argtype = 1 : i32}, %arg107: memref {byre.argname = "Input107", byre.argtype = 1 : i32}, %arg108: memref<512xf32, "cuda"> {byre.argname = "Input108", byre.argtype = 1 : i32}, %arg109: memref<512xf32, "cuda"> {byre.argname = "Input109", byre.argtype = 1 : i32}, %arg110: memref {byre.argname = "Input110", byre.argtype = 1 : i32}, %arg111: memref<512xf32, "cuda"> {byre.argname = "Input111", byre.argtype = 1 : i32}, %arg112: memref<512xf32, "cuda"> {byre.argname = "Input112", byre.argtype = 1 : i32}, %arg113: memref {byre.argname = "Input113", byre.argtype = 1 : i32}, %arg114: memref<512xf32, "cuda"> {byre.argname = "Input114", byre.argtype = 1 : i32}, %arg115: memref<512xf32, "cuda"> {byre.argname = "Input115", byre.argtype = 1 : i32}, %arg116: memref {byre.argname = "Input116", byre.argtype = 1 : i32}, %arg117: memref<512xf32, "cuda"> {byre.argname = "Input117", byre.argtype = 1 : i32}, %arg118: memref<512xf32, "cuda"> {byre.argname = "Input118", byre.argtype = 1 : i32}, %arg119: memref {byre.argname = "Input119", byre.argtype = 1 : i32}, %arg120: memref<512xf32, "cuda"> {byre.argname = "Input120", byre.argtype 
= 1 : i32}, %arg121: memref<512xf32, "cuda"> {byre.argname = "Input121", byre.argtype = 1 : i32}, %arg122: memref<1x3x224x224xf32, "cuda"> {byre.argname = "Input122", byre.argtype = 1 : i32}, %arg123: memref<1x1000xf16, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg124: memref<64xf32, "cuda"> {byre.arg_alias_index = 0 : i64, byre.argname = "Output1", byre.argtype = 2 : i32}, %arg125: memref<64xf32, "cuda"> {byre.arg_alias_index = 1 : i64, byre.argname = "Output2", byre.argtype = 2 : i32}, %arg126: memref<64xf32, "cuda"> {byre.arg_alias_index = 5 : i64, byre.argname = "Output3", byre.argtype = 2 : i32}, %arg127: memref<64xf32, "cuda"> {byre.arg_alias_index = 6 : i64, byre.argname = "Output4", byre.argtype = 2 : i32}, %arg128: memref<64xf32, "cuda"> {byre.arg_alias_index = 7 : i64, byre.argname = "Output5", byre.argtype = 2 : i32}, %arg129: memref<64xf32, "cuda"> {byre.arg_alias_index = 8 : i64, byre.argname = "Output6", byre.argtype = 2 : i32}, %arg130: memref<64xf32, "cuda"> {byre.arg_alias_index = 11 : i64, byre.argname = "Output7", byre.argtype = 2 : i32}, %arg131: memref<64xf32, "cuda"> {byre.arg_alias_index = 12 : i64, byre.argname = "Output8", byre.argtype = 2 : i32}, %arg132: memref<64xf32, "cuda"> {byre.arg_alias_index = 13 : i64, byre.argname = "Output9", byre.argtype = 2 : i32}, %arg133: memref<64xf32, "cuda"> {byre.arg_alias_index = 14 : i64, byre.argname = "Output10", byre.argtype = 2 : i32}, %arg134: memref<128xf32, "cuda"> {byre.arg_alias_index = 17 : i64, byre.argname = "Output11", byre.argtype = 2 : i32}, %arg135: memref<128xf32, "cuda"> {byre.arg_alias_index = 18 : i64, byre.argname = "Output12", byre.argtype = 2 : i32}, %arg136: memref<128xf32, "cuda"> {byre.arg_alias_index = 19 : i64, byre.argname = "Output13", byre.argtype = 2 : i32}, %arg137: memref<128xf32, "cuda"> {byre.arg_alias_index = 20 : i64, byre.argname = "Output14", byre.argtype = 2 : i32}, %arg138: memref<128xf32, "cuda"> {byre.arg_alias_index = 24 : i64, byre.argname = "Output15", byre.argtype = 2 : i32}, %arg139: memref<128xf32, "cuda"> {byre.arg_alias_index = 25 : i64, byre.argname = "Output16", byre.argtype = 2 : i32}, %arg140: memref<128xf32, "cuda"> {byre.arg_alias_index = 26 : i64, byre.argname = "Output17", byre.argtype = 2 : i32}, %arg141: memref<128xf32, "cuda"> {byre.arg_alias_index = 27 : i64, byre.argname = "Output18", byre.argtype = 2 : i32}, %arg142: memref<128xf32, "cuda"> {byre.arg_alias_index = 28 : i64, byre.argname = "Output19", byre.argtype = 2 : i32}, %arg143: memref<128xf32, "cuda"> {byre.arg_alias_index = 29 : i64, byre.argname = "Output20", byre.argtype = 2 : i32}, %arg144: memref<256xf32, "cuda"> {byre.arg_alias_index = 32 : i64, byre.argname = "Output21", byre.argtype = 2 : i32}, %arg145: memref<256xf32, "cuda"> {byre.arg_alias_index = 33 : i64, byre.argname = "Output22", byre.argtype = 2 : i32}, %arg146: memref<256xf32, "cuda"> {byre.arg_alias_index = 34 : i64, byre.argname = "Output23", byre.argtype = 2 : i32}, %arg147: memref<256xf32, "cuda"> {byre.arg_alias_index = 35 : i64, byre.argname = "Output24", byre.argtype = 2 : i32}, %arg148: memref<256xf32, "cuda"> {byre.arg_alias_index = 39 : i64, byre.argname = "Output25", byre.argtype = 2 : i32}, %arg149: memref<256xf32, "cuda"> {byre.arg_alias_index = 40 : i64, byre.argname = "Output26", byre.argtype = 2 : i32}, %arg150: memref<256xf32, "cuda"> {byre.arg_alias_index = 41 : i64, byre.argname = "Output27", byre.argtype = 2 : i32}, %arg151: memref<256xf32, "cuda"> {byre.arg_alias_index = 42 : i64, byre.argname = 
"Output28", byre.argtype = 2 : i32}, %arg152: memref<256xf32, "cuda"> {byre.arg_alias_index = 43 : i64, byre.argname = "Output29", byre.argtype = 2 : i32}, %arg153: memref<256xf32, "cuda"> {byre.arg_alias_index = 44 : i64, byre.argname = "Output30", byre.argtype = 2 : i32}, %arg154: memref<512xf32, "cuda"> {byre.arg_alias_index = 47 : i64, byre.argname = "Output31", byre.argtype = 2 : i32}, %arg155: memref<512xf32, "cuda"> {byre.arg_alias_index = 48 : i64, byre.argname = "Output32", byre.argtype = 2 : i32}, %arg156: memref<512xf32, "cuda"> {byre.arg_alias_index = 49 : i64, byre.argname = "Output33", byre.argtype = 2 : i32}, %arg157: memref<512xf32, "cuda"> {byre.arg_alias_index = 50 : i64, byre.argname = "Output34", byre.argtype = 2 : i32}, %arg158: memref<512xf32, "cuda"> {byre.arg_alias_index = 54 : i64, byre.argname = "Output35", byre.argtype = 2 : i32}, %arg159: memref<512xf32, "cuda"> {byre.arg_alias_index = 55 : i64, byre.argname = "Output36", byre.argtype = 2 : i32}, %arg160: memref<512xf32, "cuda"> {byre.arg_alias_index = 56 : i64, byre.argname = "Output37", byre.argtype = 2 : i32}, %arg161: memref<512xf32, "cuda"> {byre.arg_alias_index = 57 : i64, byre.argname = "Output38", byre.argtype = 2 : i32}, %arg162: memref<512xf32, "cuda"> {byre.arg_alias_index = 58 : i64, byre.argname = "Output39", byre.argtype = 2 : i32}, %arg163: memref<512xf32, "cuda"> {byre.arg_alias_index = 59 : i64, byre.argname = "Output40", byre.argtype = 2 : i32}, %arg164: memref<64xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : i32}, %arg165: memref<64xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg166: memref<64xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg167: memref<64xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg168: memref<64xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, %arg169: memref<64xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg170: memref<64xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg171: memref<64xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg172: memref<64xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : i32}, %arg173: memref<64xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg174: memref<128xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg175: memref<128xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg176: memref<128xf32, "cuda"> {byre.argname = "Output53", byre.argtype = 2 : i32}, %arg177: memref<128xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg178: memref<128xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg179: memref<128xf32, "cuda"> {byre.argname = "Output56", byre.argtype = 2 : i32}, %arg180: memref<128xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg181: memref<128xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg182: memref<128xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg183: memref<128xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg184: memref<256xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}, %arg185: memref<256xf32, "cuda"> {byre.argname = "Output62", byre.argtype = 2 : i32}, %arg186: memref<256xf32, "cuda"> {byre.argname = "Output63", byre.argtype = 2 : i32}, %arg187: memref<256xf32, "cuda"> {byre.argname = "Output64", byre.argtype = 2 : i32}, %arg188: memref<256xf32, "cuda"> 
{byre.argname = "Output65", byre.argtype = 2 : i32}, %arg189: memref<256xf32, "cuda"> {byre.argname = "Output66", byre.argtype = 2 : i32}, %arg190: memref<256xf32, "cuda"> {byre.argname = "Output67", byre.argtype = 2 : i32}, %arg191: memref<256xf32, "cuda"> {byre.argname = "Output68", byre.argtype = 2 : i32}, %arg192: memref<256xf32, "cuda"> {byre.argname = "Output69", byre.argtype = 2 : i32}, %arg193: memref<256xf32, "cuda"> {byre.argname = "Output70", byre.argtype = 2 : i32}, %arg194: memref<512xf32, "cuda"> {byre.argname = "Output71", byre.argtype = 2 : i32}, %arg195: memref<512xf32, "cuda"> {byre.argname = "Output72", byre.argtype = 2 : i32}, %arg196: memref<512xf32, "cuda"> {byre.argname = "Output73", byre.argtype = 2 : i32}, %arg197: memref<512xf32, "cuda"> {byre.argname = "Output74", byre.argtype = 2 : i32}, %arg198: memref<512xf32, "cuda"> {byre.argname = "Output75", byre.argtype = 2 : i32}, %arg199: memref<512xf32, "cuda"> {byre.argname = "Output76", byre.argtype = 2 : i32}, %arg200: memref<512xf32, "cuda"> {byre.argname = "Output77", byre.argtype = 2 : i32}, %arg201: memref<512xf32, "cuda"> {byre.argname = "Output78", byre.argtype = 2 : i32}, %arg202: memref<512xf32, "cuda"> {byre.argname = "Output79", byre.argtype = 2 : i32}, %arg203: memref<512xf32, "cuda"> {byre.argname = "Output80", byre.argtype = 2 : i32}, %arg204: memref<64x3x7x7xf16, "cuda"> {byre.argname = "Output81", byre.argtype = 2 : i32}, %arg205: memref<1x3x224x224xf16, "cuda"> {byre.argname = "Output82", byre.argtype = 2 : i32}, %arg206: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Output83", byre.argtype = 2 : i32}, %arg207: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Output84", byre.argtype = 2 : i32}, %arg208: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output85", byre.argtype = 2 : i32}, %arg209: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output86", byre.argtype = 2 : i32}, %arg210: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output87", byre.argtype = 2 : i32}, %arg211: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output88", byre.argtype = 2 : i32}, %arg212: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output89", byre.argtype = 2 : i32}, %arg213: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output90", byre.argtype = 2 : i32}, %arg214: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output91", byre.argtype = 2 : i32}, %arg215: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output92", byre.argtype = 2 : i32}, %arg216: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output93", byre.argtype = 2 : i32}, %arg217: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output94", byre.argtype = 2 : i32}, %arg218: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output95", byre.argtype = 2 : i32}, %arg219: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output96", byre.argtype = 2 : i32}, %arg220: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output97", byre.argtype = 2 : i32}, %arg221: memref<128x64x3x3xf16, "cuda"> {byre.argname = "Output98", byre.argtype = 2 : i32}, %arg222: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output99", byre.argtype = 2 : i32}, %arg223: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output100", byre.argtype = 2 : i32}, %arg224: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output101", byre.argtype = 2 : i32}, %arg225: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output102", byre.argtype = 2 : i32}, %arg226: memref<128x64x1x1xf16, "cuda"> {byre.argname = "Output103", byre.argtype = 2 : i32}, %arg227: memref<1x128x28x28xf16, "cuda"> {byre.argname = 
"Output104", byre.argtype = 2 : i32}, %arg228: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output105", byre.argtype = 2 : i32}, %arg229: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output106", byre.argtype = 2 : i32}, %arg230: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output107", byre.argtype = 2 : i32}, %arg231: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output108", byre.argtype = 2 : i32}, %arg232: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output109", byre.argtype = 2 : i32}, %arg233: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output110", byre.argtype = 2 : i32}, %arg234: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output111", byre.argtype = 2 : i32}, %arg235: memref<256x128x3x3xf16, "cuda"> {byre.argname = "Output112", byre.argtype = 2 : i32}, %arg236: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output113", byre.argtype = 2 : i32}, %arg237: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output114", byre.argtype = 2 : i32}, %arg238: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output115", byre.argtype = 2 : i32}, %arg239: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output116", byre.argtype = 2 : i32}, %arg240: memref<256x128x1x1xf16, "cuda"> {byre.argname = "Output117", byre.argtype = 2 : i32}, %arg241: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output118", byre.argtype = 2 : i32}, %arg242: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output119", byre.argtype = 2 : i32}, %arg243: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output120", byre.argtype = 2 : i32}, %arg244: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output121", byre.argtype = 2 : i32}, %arg245: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output122", byre.argtype = 2 : i32}, %arg246: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output123", byre.argtype = 2 : i32}, %arg247: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output124", byre.argtype = 2 : i32}, %arg248: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output125", byre.argtype = 2 : i32}, %arg249: memref<512x256x3x3xf16, "cuda"> {byre.argname = "Output126", byre.argtype = 2 : i32}, %arg250: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output127", byre.argtype = 2 : i32}, %arg251: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output128", byre.argtype = 2 : i32}, %arg252: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output129", byre.argtype = 2 : i32}, %arg253: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output130", byre.argtype = 2 : i32}, %arg254: memref<512x256x1x1xf16, "cuda"> {byre.argname = "Output131", byre.argtype = 2 : i32}, %arg255: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output132", byre.argtype = 2 : i32}, %arg256: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output133", byre.argtype = 2 : i32}, %arg257: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output134", byre.argtype = 2 : i32}, %arg258: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output135", byre.argtype = 2 : i32}, %arg259: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output136", byre.argtype = 2 : i32}, %arg260: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output137", byre.argtype = 2 : i32}, %arg261: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output138", byre.argtype = 2 : i32}, %arg262: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output139", byre.argtype = 2 : i32}, %arg263: memref<1x512xf16, "cuda"> {byre.argname = "Output140", byre.argtype = 2 : i32}, %arg264: memref<512x1000xf16, "cuda"> {byre.argname = "Output141", byre.argtype = 2 : i32}) attributes 
{byre.entry_point, device_file_name = "your_file"} { %alloc = memref.alloc() : memref<1838592xi8, "cuda"> - byre.compute @PTXOp(%arg122, %arg205) {BlockSize.x = 128 : i32, GridSize.x = 1176 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<1x3x224x224xf32, "cuda">, memref<1x3x224x224xf16, "cuda"> - byre.compute @PTXOp(%arg2, %arg204) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg122, %arg205) {BlockSize.x = 256 : i32, GridSize.x = 147 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<1x3x224x224xf32, "cuda">, memref<1x3x224x224xf16, "cuda"> + byre.compute @PTXOp(%arg2, %arg204) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg205, %arg204, %arg206) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16, "cuda">, memref<64x3x7x7xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> - %0 = "byre.alias"(%alloc) {device = "cuda", offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> - %1 = "byre.alias"(%alloc) {device = "cuda", offset = 7424 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %2 = "byre.alias"(%alloc) {device = "cuda", offset = 7168 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 224768 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 7424 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 7168 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg206, %arg1, %arg0, %0, %1, %2) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%0, %arg207) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%0, %arg207) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> byre.compute @PoolMaxOp_f16_f16(%arg207, %arg208) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = 
dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<1x64x112x112xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg9, %arg209) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg9, %arg209) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg208, %arg209, %arg210) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %3 = "byre.alias"(%alloc) {device = "cuda", offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - %4 = "byre.alias"(%alloc) {device = "cuda", offset = 6912 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %5 = "byre.alias"(%alloc) {device = "cuda", offset = 6656 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %3 = "byre.alias"(%alloc) <{offset = 224768 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 6912 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 6656 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg210, %arg6, %arg5, %3, %4, %5) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg211) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg10, %arg212) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg211) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg10, %arg212) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute 
@ConvOp_f16f16_f16(%arg211, %arg212, %arg213) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %6 = "byre.alias"(%alloc) {device = "cuda", offset = 6400 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %7 = "byre.alias"(%alloc) {device = "cuda", offset = 6144 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 6400 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 6144 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg213, %arg8, %arg7, %3, %6, %7) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg208, %arg214) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg15, %arg215) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown10", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg208, %arg214) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg15, %arg215) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg214, %arg215, %arg216) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %8 = "byre.alias"(%alloc) {device = "cuda", offset = 5888 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %9 = "byre.alias"(%alloc) {device = "cuda", offset = 5632 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 5888 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 5632 : i64}> {device = "cuda"} : (memref<1838592xi8, 
"cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg216, %arg12, %arg11, %3, %8, %9) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg217) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg16, %arg218) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg217) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg16, %arg218) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg217, %arg218, %arg219) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %10 = "byre.alias"(%alloc) {device = "cuda", offset = 5376 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %11 = "byre.alias"(%alloc) {device = "cuda", offset = 0 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 5376 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 0 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg219, %arg14, %arg13, %3, %10, %11) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg214, %arg220) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown15", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg23, %arg226) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> + byre.compute @PTXOp(%3, %arg214, %arg220) {BlockSize.x = 256 
: i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg23, %arg226) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg220, %arg226, %arg227) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %12 = "byre.alias"(%alloc) {device = "cuda", offset = 8704 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - %13 = "byre.alias"(%alloc) {device = "cuda", offset = 256 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %14 = "byre.alias"(%alloc) {device = "cuda", offset = 768 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 8704 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 256 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 768 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg227, %arg25, %arg24, %12, %13, %14) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%arg21, %arg221) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg21, %arg221) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg220, %arg221, %arg222) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %15 = "byre.alias"(%alloc) {device = "cuda", offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - %16 = "byre.alias"(%alloc) {device = "cuda", offset = 4864 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %17 = 
"byre.alias"(%alloc) {device = "cuda", offset = 1280 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 224768 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 4864 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 1280 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg222, %arg18, %arg17, %15, %16, %17) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg223) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg22, %arg224) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %arg223) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg22, %arg224) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg223, %arg224, %arg225) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %18 = "byre.alias"(%alloc) {device = "cuda", offset = 1792 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %19 = "byre.alias"(%alloc) {device = "cuda", offset = 2304 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %18 = "byre.alias"(%alloc) <{offset = 1792 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 2304 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg225, %arg20, %arg19, %15, %18, %19) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %12, %arg228) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : 
i32], device = "cuda", kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg30, %arg229) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %12, %arg228) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg30, %arg229) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg228, %arg229, %arg230) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %20 = "byre.alias"(%alloc) {device = "cuda", offset = 2816 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %21 = "byre.alias"(%alloc) {device = "cuda", offset = 3328 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %20 = "byre.alias"(%alloc) <{offset = 2816 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 3328 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg230, %arg27, %arg26, %15, %20, %21) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg231) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg31, %arg232) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown27", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %arg231) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg31, %arg232) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, 
"cuda"> byre.compute @ConvOp_f16f16_f16(%arg231, %arg232, %arg233) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %22 = "byre.alias"(%alloc) {device = "cuda", offset = 3840 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %23 = "byre.alias"(%alloc) {device = "cuda", offset = 4352 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 3840 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 4352 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg233, %arg29, %arg28, %15, %22, %23) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg228, %arg234) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown29", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg38, %arg240) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%15, %arg228, %arg234) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg38, %arg240) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg234, %arg240, %arg241) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %24 = "byre.alias"(%alloc) {device = "cuda", offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - %25 = "byre.alias"(%alloc) {device = "cuda", offset = 223744 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %26 = "byre.alias"(%alloc) {device = "cuda", offset = 1836544 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %24 = 
"byre.alias"(%alloc) <{offset = 224768 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 223744 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 1836544 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg241, %arg40, %arg39, %24, %25, %26) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%arg36, %arg235) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown32", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg36, %arg235) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown32", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg234, %arg235, %arg236) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %27 = "byre.alias"(%alloc) {device = "cuda", offset = 325120 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - %28 = "byre.alias"(%alloc) {device = "cuda", offset = 1835520 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %29 = "byre.alias"(%alloc) {device = "cuda", offset = 1834496 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 325120 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %28 = "byre.alias"(%alloc) <{offset = 1835520 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 1834496 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg236, %arg33, %arg32, %27, %28, %29) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%27, %arg237) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg37, %arg238) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : 
memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%27, %arg237) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg37, %arg238) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg237, %arg238, %arg239) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %30 = "byre.alias"(%alloc) {device = "cuda", offset = 1833472 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %31 = "byre.alias"(%alloc) {device = "cuda", offset = 1837568 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 1833472 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 1837568 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg239, %arg35, %arg34, %27, %30, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%27, %24, %arg242) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg45, %arg243) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown38", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%27, %24, %arg242) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg45, %arg243) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg242, %arg243, %arg244) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : 
tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %32 = "byre.alias"(%alloc) {device = "cuda", offset = 1832448 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %33 = "byre.alias"(%alloc) {device = "cuda", offset = 1831424 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 1832448 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 1831424 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg244, %arg42, %arg41, %24, %32, %33) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%24, %arg245) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown40", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg46, %arg246) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown41", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%24, %arg245) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg46, %arg246) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg245, %arg246, %arg247) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %34 = "byre.alias"(%alloc) {device = "cuda", offset = 1830400 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %35 = "byre.alias"(%alloc) {device = "cuda", offset = 7680 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 1830400 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 7680 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg247, %arg44, %arg43, %24, %34, %35) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, 
"cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%24, %arg242, %arg248) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown43", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg53, %arg254) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> + byre.compute @PTXOp(%24, %arg242, %arg248) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg53, %arg254) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg248, %arg254, %arg255) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %36 = "byre.alias"(%alloc) {device = "cuda", offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %37 = "byre.alias"(%alloc) {device = "cuda", offset = 8704 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %38 = "byre.alias"(%alloc) {device = "cuda", offset = 209408 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 224768 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 8704 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 209408 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg255, %arg55, %arg54, %36, %37, %38) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%arg51, %arg249) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg51, %arg249) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> byre.compute 
@ConvOp_f16f16_f16(%arg248, %arg249, %arg250) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %39 = "byre.alias"(%alloc) {device = "cuda", offset = 274944 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %40 = "byre.alias"(%alloc) {device = "cuda", offset = 12800 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %41 = "byre.alias"(%alloc) {device = "cuda", offset = 10752 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 274944 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 12800 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 10752 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg250, %arg48, %arg47, %39, %40, %41) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%39, %arg251) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg52, %arg252) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%39, %arg251) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg52, %arg252) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg251, %arg252, %arg253) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %42 = "byre.alias"(%alloc) {device = "cuda", offset = 211456 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %43 = "byre.alias"(%alloc) {device = "cuda", offset = 213504 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %42 = "byre.alias"(%alloc) 
<{offset = 211456 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 213504 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg253, %arg50, %arg49, %39, %42, %43) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%39, %36, %arg256) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg60, %arg257) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown52", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%39, %36, %arg256) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg60, %arg257) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg256, %arg257, %arg258) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %44 = "byre.alias"(%alloc) {device = "cuda", offset = 215552 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %45 = "byre.alias"(%alloc) {device = "cuda", offset = 217600 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 215552 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 217600 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg258, %arg57, %arg56, %36, %44, %45) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%36, %arg259) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown54", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg61, %arg260) 
{BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%36, %arg259) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg61, %arg260) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg259, %arg260, %arg261) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %46 = "byre.alias"(%alloc) {device = "cuda", offset = 219648 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %47 = "byre.alias"(%alloc) {device = "cuda", offset = 221696 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 219648 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %47 = "byre.alias"(%alloc) <{offset = 221696 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg261, %arg59, %arg58, %36, %46, %47) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%36, %arg256, %arg262) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %48 = "byre.alias"(%alloc) {device = "cuda", offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%arg262, %48) {device = "cuda", dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512xf16, "cuda"> - byre.compute @PTXOp(%48, %arg263) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown58", memory_effects = [1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512xf16, "cuda"> - %49 = "byre.alias"(%alloc) {device = "cuda", offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%arg4, %49) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown59", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @TransposeOp_f16_f16(%49, %arg264) 
{device = "cuda", memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16, "cuda">, memref<512x1000xf16, "cuda"> - %50 = "byre.alias"(%alloc) {device = "cuda", offset = 14848 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x1000xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%arg263, %49, %50) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<1x1000xf16, "cuda"> - byre.compute @PTXOp(%arg3, %50, %arg123) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown60", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">, memref<1x1000xf16, "cuda"> - byre.compute @PTXOp(%1, %arg63, %arg164) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%2, %arg64, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%4, %arg66, %arg166) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown63", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%5, %arg67, %arg167) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%6, %arg69, %arg168) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown65", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%7, %arg70, %arg169) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown66", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%8, %arg72, %arg170) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown67", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%9, %arg73, %arg171) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown68", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%10, %arg75, %arg172) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, 
"cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%11, %arg76, %arg173) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown70", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%16, %arg78, %arg174) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown71", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%17, %arg79, %arg175) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%18, %arg81, %arg176) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%19, %arg82, %arg177) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%13, %arg84, %arg178) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown75", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%14, %arg85, %arg179) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown76", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%20, %arg87, %arg180) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown77", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%21, %arg88, %arg181) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown78", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%22, %arg90, %arg182) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown79", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%23, %arg91, %arg183) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown80", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%28, %arg93, %arg184) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown81", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : 
memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%29, %arg94, %arg185) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%30, %arg96, %arg186) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown83", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%31, %arg97, %arg187) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown84", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%25, %arg99, %arg188) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown85", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%26, %arg100, %arg189) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown86", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%32, %arg102, %arg190) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown87", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%33, %arg103, %arg191) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown88", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%34, %arg105, %arg192) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%35, %arg106, %arg193) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown90", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%40, %arg108, %arg194) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown91", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%41, %arg109, %arg195) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%42, %arg111, %arg196) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown93", 
memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%43, %arg112, %arg197) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown94", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%37, %arg114, %arg198) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown95", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%38, %arg115, %arg199) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown96", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%44, %arg117, %arg200) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown97", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%45, %arg118, %arg201) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown98", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%46, %arg120, %arg202) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown99", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%47, %arg121, %arg203) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown100", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%36, %arg256, %arg262) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %48 = "byre.alias"(%arg262) <{offset = 0 : i64}> {device = "cuda"} : (memref<1x512x7x7xf16, "cuda">) -> memref<512x49xf16, "cuda"> + %49 = "byre.alias"(%alloc) <{offset = 224768 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf16, "cuda"> + byre.compute @PTXOp(%48, %49) {BlockSize.x = 64 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 512 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown58_kernel"} : memref<512x49xf16, "cuda">, memref<512xf16, "cuda"> + %50 = "byre.alias"(%alloc) <{offset = 224768 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x512xf16, "cuda"> + byre.compute @PTXOp(%50, %arg263) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown59", memory_effects = [1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512xf16, "cuda"> + %51 = "byre.alias"(%alloc) <{offset = 224768 : i64}> {device = "cuda"} : 
(memref<1838592xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%arg4, %51) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown60", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> + byre.compute @TransposeOp_f16_f16(%51, %arg264) {device = "cuda", memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16, "cuda">, memref<512x1000xf16, "cuda"> + %52 = "byre.alias"(%alloc) <{offset = 14848 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x1000xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%arg263, %51, %52) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<1x1000xf16, "cuda"> + byre.compute @PTXOp(%arg3, %52, %arg123) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">, memref<1x1000xf16, "cuda"> + byre.compute @PTXOp(%1, %arg63, %arg164) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%2, %arg64, %arg165) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%4, %arg66, %arg166) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%5, %arg67, %arg167) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%6, %arg69, %arg168) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%7, %arg70, %arg169) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%8, %arg72, %arg170) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%9, %arg73, %arg171) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = 
[1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%10, %arg75, %arg172) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%11, %arg76, %arg173) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%16, %arg78, %arg174) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%17, %arg79, %arg175) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%18, %arg81, %arg176) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%19, %arg82, %arg177) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%13, %arg84, %arg178) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%14, %arg85, %arg179) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%20, %arg87, %arg180) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%21, %arg88, %arg181) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%22, %arg90, %arg182) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%23, %arg91, %arg183) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = 
"Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%28, %arg93, %arg184) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%29, %arg94, %arg185) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%30, %arg96, %arg186) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%31, %arg97, %arg187) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%25, %arg99, %arg188) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%26, %arg100, %arg189) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%32, %arg102, %arg190) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%33, %arg103, %arg191) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%34, %arg105, %arg192) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%35, %arg106, %arg193) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%40, %arg108, %arg194) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%41, %arg109, %arg195) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 
1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%42, %arg111, %arg196) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%43, %arg112, %arg197) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%37, %arg114, %arg198) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%38, %arg115, %arg199) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%44, %arg117, %arg200) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%45, %arg118, %arg201) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%46, %arg120, %arg202) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%47, %arg121, %arg203) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> byre.copy(%arg0, %arg124) {callee = "cuda2cuda", device = "cuda"} : memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.copy(%arg1, %arg125) {callee = "cuda2cuda", device = "cuda"} : memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.copy(%arg5, %arg126) {callee = "cuda2cuda", device = "cuda"} : memref<64xf32, "cuda">, memref<64xf32, "cuda"> diff --git a/compiler/test/E2E/ResNet18/Whole/2_linalg_tensor_opt.mlir b/compiler/test/E2E/ResNet18/Whole/2_linalg_tensor_opt.mlir index 8d4fa513d..53f87a4e0 100644 --- a/compiler/test/E2E/ResNet18/Whole/2_linalg_tensor_opt.mlir +++ b/compiler/test/E2E/ResNet18/Whole/2_linalg_tensor_opt.mlir @@ -21,18 +21,6 @@ module @IrToMhlo.2452 { %0 = mhlo.convert %arg0 : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> return %0 : tensor<64x64x3x3xf16> } - func.func private @Unknown4(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<64x64x3x3xf32>) -> 
tensor<64x64x3x3xf16> - return %0 : tensor<64x64x3x3xf16> - } - func.func private @Unknown5(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - return %0 : tensor<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - return %0 : tensor<64x64x3x3xf16> - } func.func private @Unknown7(%arg0: tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> return %0 : tensor<128x64x1x1xf16> @@ -45,14 +33,6 @@ module @IrToMhlo.2452 { %0 = mhlo.convert %arg0 : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> return %0 : tensor<128x128x3x3xf16> } - func.func private @Unknown10(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - return %0 : tensor<128x128x3x3xf16> - } - func.func private @Unknown11(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - return %0 : tensor<128x128x3x3xf16> - } func.func private @Unknown12(%arg0: tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> return %0 : tensor<256x128x1x1xf16> @@ -65,14 +45,6 @@ module @IrToMhlo.2452 { %0 = mhlo.convert %arg0 : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> return %0 : tensor<256x256x3x3xf16> } - func.func private @Unknown15(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - return %0 : tensor<256x256x3x3xf16> - } - func.func private @Unknown16(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - return %0 : tensor<256x256x3x3xf16> - } func.func private @Unknown17(%arg0: tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> return %0 : tensor<512x256x1x1xf16> @@ -85,14 +57,6 @@ module @IrToMhlo.2452 { %0 = mhlo.convert %arg0 : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> return %0 : tensor<512x512x3x3xf16> } - func.func private @Unknown20(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - return %0 : tensor<512x512x3x3xf16> - } - func.func private @Unknown21(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - return %0 : tensor<512x512x3x3xf16> - } func.func private @Unknown22(%arg0: tensor<4x1000xf32>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<-2.500000e-01> : tensor<4x1000xf32> %1 = mhlo.multiply %arg0, %0 : 
tensor<4x1000xf32> @@ -103,261 +67,158 @@ module @IrToMhlo.2452 { %0 = mhlo.convert %arg0 : (tensor<1000x512xf32>) -> tensor<1000x512xf16> return %0 : tensor<1000x512xf16> } - func.func private @Unknown24(%arg0: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown24(%arg0: tensor<1000xf32>) -> tensor<1000xf16> attributes {__byteir_elementwise_fusion__} { + %0 = mhlo.convert %arg0 : (tensor<1000xf32>) -> tensor<1000xf16> + return %0 : tensor<1000xf16> + } + func.func private @Unknown25(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = mhlo.reduce(%arg0 init: %0) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> + reducer(%arg1: tensor, %arg2: tensor) { + %2 = mhlo.add %arg1, %arg2 : tensor + mhlo.return %2 : tensor + } + return %1 : tensor<4xf16> + } + func.func private @Unknown26(%arg0: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x112x112xf16> %1 = mhlo.maximum %arg0, %0 : tensor<4x64x112x112xf16> %2 = mhlo.compare GT, %1, %0 : (tensor<4x64x112x112xf16>, tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xi1> return %1, %2 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> } - func.func private @BatchNormTrainingOp25(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x56x56xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @Unknown26(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x56x56xf16> - %1 = mhlo.maximum %arg0, %0 : tensor<4x64x56x56xf16> - %2 = mhlo.compare GT, %1, %0 : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xi1> - return %1, %2 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> - } func.func private @BatchNormTrainingOp27(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x56x56xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) %1 = mhlo.convert %output : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> return %1 : tensor<4x64x56x56xf16> } - func.func private @Unknown28(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %0 = 
mhlo.constant dense<0.000000e+00> : tensor<4x64x56x56xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<4x64x56x56xf16> - %2 = mhlo.maximum %1, %0 : tensor<4x64x56x56xf16> - %3 = mhlo.compare GT, %2, %0 : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xi1> - return %2, %3 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> - } - func.func private @BatchNormTrainingOp29(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x56x56xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @Unknown30(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown28(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x56x56xf16> %1 = mhlo.maximum %arg0, %0 : tensor<4x64x56x56xf16> %2 = mhlo.compare GT, %1, %0 : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xi1> return %1, %2 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> } - func.func private @BatchNormTrainingOp31(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x56x56xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @Unknown32(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown30(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x56x56xf16> %1 = mhlo.add %arg0, %arg1 : tensor<4x64x56x56xf16> %2 = mhlo.maximum %1, %0 : tensor<4x64x56x56xf16> %3 = mhlo.compare GT, %2, %0 : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xi1> return %2, %3 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> } - func.func private @BatchNormTrainingOp33(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, 
%arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @BatchNormTrainingOp34(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { + func.func private @BatchNormTrainingOp35(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> return %1 : tensor<4x128x28x28xf16> } - func.func private @Unknown35(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown37(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x128x28x28xf16> %1 = mhlo.maximum %arg0, %0 : tensor<4x128x28x28xf16> %2 = mhlo.compare GT, %1, %0 : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xi1> return %1, %2 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> } - func.func private @BatchNormTrainingOp36(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown37(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown39(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x128x28x28xf16> %1 = mhlo.add %arg0, %arg1 : tensor<4x128x28x28xf16> %2 = mhlo.maximum %1, %0 : tensor<4x128x28x28xf16> %3 = mhlo.compare GT, %2, %0 : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xi1> return %2, %3 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> } - func.func private @BatchNormTrainingOp38(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) 
-> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown39(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x128x28x28xf16> - %1 = mhlo.maximum %arg0, %0 : tensor<4x128x28x28xf16> - %2 = mhlo.compare GT, %1, %0 : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xi1> - return %1, %2 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> - } - func.func private @BatchNormTrainingOp40(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown41(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x128x28x28xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<4x128x28x28xf16> - %2 = mhlo.maximum %1, %0 : tensor<4x128x28x28xf16> - %3 = mhlo.compare GT, %2, %0 : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xi1> - return %2, %3 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> - } - func.func private @BatchNormTrainingOp42(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { + func.func private @BatchNormTrainingOp44(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> return %1 : tensor<4x256x14x14xf16> } - func.func private @BatchNormTrainingOp43(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> 
attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown44(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown46(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x256x14x14xf16> %1 = mhlo.maximum %arg0, %0 : tensor<4x256x14x14xf16> %2 = mhlo.compare GT, %1, %0 : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xi1> return %1, %2 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> } - func.func private @BatchNormTrainingOp45(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown46(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x256x14x14xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<4x256x14x14xf16> - %2 = mhlo.maximum %1, %0 : tensor<4x256x14x14xf16> - %3 = mhlo.compare GT, %2, %0 : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xi1> - return %2, %3 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> - } - func.func private @BatchNormTrainingOp47(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown48(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x256x14x14xf16> - %1 = mhlo.maximum %arg0, %0 : tensor<4x256x14x14xf16> - %2 = 
mhlo.compare GT, %1, %0 : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xi1> - return %1, %2 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> - } - func.func private @BatchNormTrainingOp49(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown50(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown48(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x256x14x14xf16> %1 = mhlo.add %arg0, %arg1 : tensor<4x256x14x14xf16> %2 = mhlo.maximum %1, %0 : tensor<4x256x14x14xf16> %3 = mhlo.compare GT, %2, %0 : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xi1> return %2, %3 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> } - func.func private @BatchNormTrainingOp51(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { + func.func private @BatchNormTrainingOp53(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> return %1 : tensor<4x512x7x7xf16> } - func.func private @BatchNormTrainingOp52(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @Unknown53(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private 
@Unknown55(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x512x7x7xf16> %1 = mhlo.maximum %arg0, %0 : tensor<4x512x7x7xf16> %2 = mhlo.compare GT, %1, %0 : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xi1> return %1, %2 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> } - func.func private @BatchNormTrainingOp54(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @Unknown55(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown57(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x512x7x7xf16> %1 = mhlo.add %arg0, %arg1 : tensor<4x512x7x7xf16> %2 = mhlo.maximum %1, %0 : tensor<4x512x7x7xf16> %3 = mhlo.compare GT, %2, %0 : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xi1> return %2, %3 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> } - func.func private @BatchNormTrainingOp56(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @Unknown57(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x512x7x7xf16> - %1 = mhlo.maximum %arg0, %0 : tensor<4x512x7x7xf16> - %2 = mhlo.compare GT, %1, %0 : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xi1> - return %1, %2 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> - } - func.func private @BatchNormTrainingOp58(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, 
tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @Unknown59(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x512x7x7xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<4x512x7x7xf16> - %2 = mhlo.maximum %1, %0 : tensor<4x512x7x7xf16> - %3 = mhlo.compare GT, %2, %0 : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xi1> - return %2, %3 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + func.func private @Unknown62(%arg0: tensor<4x512x7x7xf16>) -> tensor<4x512xf16> attributes {__byteir_reduction_fusion__} { + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = mhlo.reduce(%arg0 init: %0) across dimensions = [3, 2] : (tensor<4x512x7x7xf16>, tensor) -> tensor<4x512xf16> + reducer(%arg1: tensor, %arg2: tensor) { + %2 = mhlo.add %arg1, %arg2 : tensor + mhlo.return %2 : tensor + } + return %1 : tensor<4x512xf16> } - func.func private @Unknown60(%arg0: tensor<4x512xf16>) -> tensor<4x512xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown63(%arg0: tensor<4x512xf16>) -> tensor<4x512xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<2.040100e-02> : tensor<4x512xf16> %1 = mhlo.multiply %arg0, %0 : tensor<4x512xf16> return %1 : tensor<4x512xf16> } - func.func private @Unknown61(%arg0: tensor<1000xf32>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<1000xf32>) -> tensor<1000xf16> - %1 = "mhlo.broadcast_in_dim"(%0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1000xf16>) -> tensor<4x1000xf16> - %2 = mhlo.add %arg1, %1 : tensor<4x1000xf16> - return %2 : tensor<4x1000xf16> + func.func private @Unknown64(%arg0: tensor<1000xf16>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1000xf16>) -> tensor<4x1000xf16> + %1 = mhlo.add %arg1, %0 : tensor<4x1000xf16> + return %1 : tensor<4x1000xf16> + } + func.func private @Unknown65(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %0 = mhlo.constant dense<0xFC00> : tensor + %1 = mhlo.reduce(%arg0 init: %0) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> + reducer(%arg1: tensor, %arg2: tensor) { + %2 = mhlo.maximum %arg1, %arg2 : tensor + mhlo.return %2 : tensor + } + return %1 : tensor<4xf16> } - func.func private @Unknown62(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown66(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf16>) -> tensor<4x1000xf16> %1 = mhlo.subtract %arg1, %0 : tensor<4x1000xf16> - %2 = mhlo.exponential %1 : tensor<4x1000xf16> - return %1, %2 : tensor<4x1000xf16>, tensor<4x1000xf16> + return %1 : tensor<4x1000xf16> + } + func.func private @Unknown67(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = 
mhlo.exponential %arg0 : tensor<4x1000xf16> + %2 = mhlo.reduce(%1 init: %0) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> + reducer(%arg1: tensor, %arg2: tensor) { + %3 = mhlo.add %arg1, %arg2 : tensor + mhlo.return %3 : tensor + } + return %2 : tensor<4xf16> } - func.func private @Unknown63(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>, %arg2: tensor<4xf16>, %arg3: tensor<4x1000xf16>, %arg4: tensor<4x1000xf32>) -> (tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown68(%arg0: tensor<4xf16>) -> tensor<4xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.log %arg0 : tensor<4xf16> - %1 = "mhlo.broadcast_in_dim"(%0) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf16>) -> tensor<4x1000xf16> - %2 = mhlo.subtract %arg1, %1 : tensor<4x1000xf16> - %3 = mhlo.exponential %2 : tensor<4x1000xf16> - %4 = "mhlo.broadcast_in_dim"(%arg2) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf16>) -> tensor<4x1000xf16> - %5 = mhlo.multiply %3, %4 : tensor<4x1000xf16> - %6 = mhlo.subtract %arg3, %5 : tensor<4x1000xf16> - %7 = mhlo.convert %2 : (tensor<4x1000xf16>) -> tensor<4x1000xf32> - %8 = mhlo.multiply %7, %arg4 : tensor<4x1000xf32> - %9 = mhlo.convert %6 : (tensor<4x1000xf16>) -> tensor<4x1000xf32> - return %6, %8, %9 : tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32> - } - func.func private @Unknown64(%arg0: tensor<4x512xf16>, %arg1: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + return %0 : tensor<4xf16> + } + func.func private @Unknown69(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>, %arg2: tensor<4xf16>, %arg3: tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf16>) -> tensor<4x1000xf16> + %1 = mhlo.subtract %arg1, %0 : tensor<4x1000xf16> + %2 = mhlo.exponential %1 : tensor<4x1000xf16> + %3 = "mhlo.broadcast_in_dim"(%arg2) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf16>) -> tensor<4x1000xf16> + %4 = mhlo.multiply %2, %3 : tensor<4x1000xf16> + %5 = mhlo.subtract %arg3, %4 : tensor<4x1000xf16> + return %1, %5 : tensor<4x1000xf16>, tensor<4x1000xf16> + } + func.func private @Unknown70(%arg0: tensor<4x512xf16>, %arg1: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<4.900000e+01> : tensor<4x512x7x7xf16> %1 = mhlo.constant dense<0.000000e+00> : tensor<4x512x7x7xf16> %2 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x512xf16>) -> tensor<4x512x7x7xf16> @@ -365,803 +226,508 @@ module @IrToMhlo.2452 { %4 = mhlo.select %arg1, %3, %1 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16> return %4 : tensor<4x512x7x7xf16> } - func.func private @BatchNormGradOp65(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) 
{epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) + func.func private @BatchNormGradOp71(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> + %1 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> + %2 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> } - func.func private @ConvBackwardDataOp66(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp72(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<4x512x7x7xf16> return %2 : tensor<4x512x7x7xf16> } - func.func private @ConvBackwardFilterOp67(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp73(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, 
__byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<3x3x512x512xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,512,3,3]{0,1,3,2}"} : (tensor<3x3x512x512xf16>) -> tensor<512x512x3x3xf16> return %1 : tensor<512x512x3x3xf16> } - func.func private @Unknown68(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown74(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x512x7x7xf16> %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16> return %1 : tensor<4x512x7x7xf16> } - func.func private @BatchNormGradOp69(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp70(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<4x512x7x7xf16> - return %2 : tensor<4x512x7x7xf16> - } - func.func private 
@ConvBackwardFilterOp71(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,512,3,3]{0,1,3,2}"} : (tensor<3x3x512x512xf16>) -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown72(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>, %arg2: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>, %arg2: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x512x7x7xf16> %1 = mhlo.add %arg0, %arg1 : tensor<4x512x7x7xf16> %2 = mhlo.select %arg2, %1, %0 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16> return %2 : tensor<4x512x7x7xf16> } - func.func private @BatchNormGradOp73(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp74(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = 
[[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<4x512x7x7xf16> - return %2 : tensor<4x512x7x7xf16> - } - func.func private @ConvBackwardFilterOp75(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,512,3,3]{0,1,3,2}"} : (tensor<3x3x512x512xf16>) -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown76(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x512x7x7xf16> - %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @BatchNormGradOp77(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp78(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp84(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = 
"ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,256,512]{1,0,2,3}"} : (tensor<512x256x3x3xf16>) -> tensor<3x3x256x512xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,256,512]{1,0,2,3}"} : (tensor<3x3x256x512xf16>) -> tensor<3x3x256x512xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 2], [1, 2]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<3x3x256x512xf16>) -> tensor<4x256x14x14xf16> return %2 : tensor<4x256x14x14xf16> } - func.func private @ConvBackwardFilterOp79(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp85(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 0], [1, 0]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<3x3x256x512xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,256,3,3]{0,1,3,2}"} : (tensor<3x3x256x512xf16>) -> tensor<512x256x3x3xf16> return %1 : tensor<512x256x3x3xf16> } - func.func private @BatchNormGradOp80(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp81(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x256x1x1xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", 
__byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp87(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x256x1x1xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[1,1,256,512]{1,0,2,3}"} : (tensor<512x256x1x1xf16>) -> tensor<1x1x256x512xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<1x1x256x512xf16>) -> tensor<4x256x14x14xf16> return %1 : tensor<4x256x14x14xf16> } - func.func private @ConvBackwardFilterOp82(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x256x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp88(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x256x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[0, -1], [0, -1]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<1x1x256x512xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,256,1,1]{0,1,3,2}"} : (tensor<1x1x256x512xf16>) -> tensor<512x256x1x1xf16> return %1 : tensor<512x256x1x1xf16> } - func.func private @Unknown83(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x256x14x14xf16> %1 = mhlo.add %arg0, %arg1 : tensor<4x256x14x14xf16> %2 = mhlo.select %arg2, %1, %0 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16> return %2 : tensor<4x256x14x14xf16> } - func.func private @BatchNormGradOp84(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, 
__byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) + func.func private @BatchNormGradOp90(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> + %1 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> + %2 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> } - func.func private @ConvBackwardDataOp85(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp91(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<4x256x14x14xf16> return %2 : tensor<4x256x14x14xf16> } - func.func private @ConvBackwardFilterOp86(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", 
__byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp92(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<3x3x256x256xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,256,3,3]{0,1,3,2}"} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> return %1 : tensor<256x256x3x3xf16> } - func.func private @Unknown87(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown93(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x256x14x14xf16> %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16> return %1 : tensor<4x256x14x14xf16> } - func.func private @BatchNormGradOp88(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp89(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : 
(tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<4x256x14x14xf16> - return %2 : tensor<4x256x14x14xf16> - } - func.func private @ConvBackwardFilterOp90(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,256,3,3]{0,1,3,2}"} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown91(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x256x14x14xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<4x256x14x14xf16> - %2 = mhlo.select %arg2, %1, %0 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16> - return %2 : tensor<4x256x14x14xf16> - } - func.func private @BatchNormGradOp92(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp93(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = 
"f16[3,3,256,256]{1,0,2,3}"} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<4x256x14x14xf16> - return %2 : tensor<4x256x14x14xf16> - } - func.func private @ConvBackwardFilterOp94(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,256,3,3]{0,1,3,2}"} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown95(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x256x14x14xf16> - %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @BatchNormGradOp96(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp97(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + 
func.func private @ConvBackwardDataOp103(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,128,256]{1,0,2,3}"} : (tensor<256x128x3x3xf16>) -> tensor<3x3x128x256xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,128,256]{1,0,2,3}"} : (tensor<3x3x128x256xf16>) -> tensor<3x3x128x256xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 2], [1, 2]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<3x3x128x256xf16>) -> tensor<4x128x28x28xf16> return %2 : tensor<4x128x28x28xf16> } - func.func private @ConvBackwardFilterOp98(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp104(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 0], [1, 0]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<3x3x128x256xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,128,3,3]{0,1,3,2}"} : (tensor<3x3x128x256xf16>) -> tensor<256x128x3x3xf16> return %1 : tensor<256x128x3x3xf16> } - func.func private @BatchNormGradOp99(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert 
%grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp100(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x128x1x1xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp106(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x128x1x1xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[1,1,128,256]{1,0,2,3}"} : (tensor<256x128x1x1xf16>) -> tensor<1x1x128x256xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<1x1x128x256xf16>) -> tensor<4x128x28x28xf16> return %1 : tensor<4x128x28x28xf16> } - func.func private @ConvBackwardFilterOp101(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x128x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp107(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x128x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[0, -1], [0, -1]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<1x1x128x256xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,128,1,1]{0,1,3,2}"} : (tensor<1x1x128x256xf16>) -> tensor<256x128x1x1xf16> return %1 : tensor<256x128x1x1xf16> } - func.func private @Unknown102(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown108(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %0 = 
mhlo.constant dense<0.000000e+00> : tensor<4x128x28x28xf16> %1 = mhlo.add %arg0, %arg1 : tensor<4x128x28x28xf16> %2 = mhlo.select %arg2, %1, %0 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16> return %2 : tensor<4x128x28x28xf16> } - func.func private @BatchNormGradOp103(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) + func.func private @BatchNormGradOp109(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> + %1 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> + %2 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> } - func.func private @ConvBackwardDataOp104(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp110(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 
1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<4x128x28x28xf16> return %2 : tensor<4x128x28x28xf16> } - func.func private @ConvBackwardFilterOp105(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp111(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<3x3x128x128xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,128,3,3]{0,1,3,2}"} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> return %1 : tensor<128x128x3x3xf16> } - func.func private @Unknown106(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown112(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x128x28x28xf16> %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16> return %1 : tensor<4x128x28x28xf16> } - func.func private @BatchNormGradOp107(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp108(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = 
"NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<4x128x28x28xf16> - return %2 : tensor<4x128x28x28xf16> - } - func.func private @ConvBackwardFilterOp109(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,128,3,3]{0,1,3,2}"} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown110(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x128x28x28xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<4x128x28x28xf16> - %2 = mhlo.select %arg2, %1, %0 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16> - return %2 : tensor<4x128x28x28xf16> - } - func.func private @BatchNormGradOp111(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp112(%arg0: tensor<4x128x28x28xf16>, %arg1: 
tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<4x128x28x28xf16> - return %2 : tensor<4x128x28x28xf16> - } - func.func private @ConvBackwardFilterOp113(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,128,3,3]{0,1,3,2}"} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown114(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x128x28x28xf16> - %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @BatchNormGradOp115(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, 
tensor<128xf32> - } - func.func private @ConvBackwardDataOp116(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp122(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,128]{1,0,2,3}"} : (tensor<128x64x3x3xf16>) -> tensor<3x3x64x128xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,128]{1,0,2,3}"} : (tensor<3x3x64x128xf16>) -> tensor<3x3x64x128xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 2], [1, 2]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<3x3x64x128xf16>) -> tensor<4x64x56x56xf16> return %2 : tensor<4x64x56x56xf16> } - func.func private @ConvBackwardFilterOp117(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp123(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 0], [1, 0]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<3x3x64x128xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x128xf16>) -> tensor<128x64x3x3xf16> return %1 : tensor<128x64x3x3xf16> } - func.func private @BatchNormGradOp118(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %1 = mhlo.convert 
%arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp119(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x64x1x1xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp125(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x64x1x1xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[1,1,64,128]{1,0,2,3}"} : (tensor<128x64x1x1xf16>) -> tensor<1x1x64x128xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<1x1x64x128xf16>) -> tensor<4x64x56x56xf16> return %1 : tensor<4x64x56x56xf16> } - func.func private @ConvBackwardFilterOp120(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x64x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp126(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x64x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[0, -1], [0, -1]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<1x1x64x128xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,64,1,1]{0,1,3,2}"} : (tensor<1x1x64x128xf16>) -> tensor<128x64x1x1xf16> return 
%1 : tensor<128x64x1x1xf16> } - func.func private @Unknown121(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown127(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x56x56xf16> %1 = mhlo.add %arg0, %arg1 : tensor<4x64x56x56xf16> %2 = mhlo.select %arg2, %1, %0 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16> return %2 : tensor<4x64x56x56xf16> } - func.func private @BatchNormGradOp122(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %1 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x56x56xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) + func.func private @BatchNormGradOp128(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> + %1 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> + %2 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x56x56xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) %3 = mhlo.convert %grad_operand : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> return %3, %grad_scale, %grad_offset : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> } - func.func private @ConvBackwardDataOp123(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp129(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<64x64x3x3xf16>) -> 
tensor<3x3x64x64xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<4x64x56x56xf16> return %2 : tensor<4x64x56x56xf16> } - func.func private @ConvBackwardFilterOp124(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp130(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<3x3x64x64xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> return %1 : tensor<64x64x3x3xf16> } - func.func private @Unknown125(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown131(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x56x56xf16> %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16> return %1 : tensor<4x64x56x56xf16> } - func.func private @BatchNormGradOp126(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %1 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x56x56xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<4x64x56x56xf16>, tensor<64xf32>, 
tensor<64xf32> - } - func.func private @ConvBackwardDataOp127(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<4x64x56x56xf16> - return %2 : tensor<4x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp128(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown129(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x56x56xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<4x64x56x56xf16> - %2 = mhlo.select %arg2, %1, %0 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16> - return %2 : tensor<4x64x56x56xf16> - } - func.func private @BatchNormGradOp130(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %1 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x56x56xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : 
(tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp131(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<4x64x56x56xf16> - return %2 : tensor<4x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp132(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown133(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x56x56xf16> - %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @BatchNormGradOp134(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %1 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x56x56xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = 
mhlo.convert %grad_operand : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp135(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<4x64x56x56xf16> - return %2 : tensor<4x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp136(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown137(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown143(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.add %arg0, %arg1 : tensor<4x64x56x56xf16> return %0 : tensor<4x64x56x56xf16> } - func.func private @Unknown138(%arg0: tensor<4x64x112x112xi1>, %arg1: tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown144(%arg0: tensor<4x64x112x112xi1>, %arg1: tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x112x112xf16> %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16> return %1 : tensor<4x64x112x112xf16> } - func.func private @BatchNormGradOp139(%arg0: tensor<4x64x112x112xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon 
= 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf32> - %1 = mhlo.convert %arg2 : (tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x112x112xf32>) -> (tensor<4x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>) + func.func private @BatchNormGradOp145(%arg0: tensor<4x64x112x112xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> + %1 = mhlo.convert %arg0 : (tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf32> + %2 = mhlo.convert %arg2 : (tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x112x112xf32>) -> (tensor<4x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>) %3 = mhlo.convert %grad_operand : (tensor<4x64x112x112xf32>) -> tensor<4x64x112x112xf16> return %3, %grad_scale, %grad_offset : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32> } - func.func private @ConvBackwardFilterOp140(%arg0: tensor<4x3x224x224xf16>, %arg1: tensor<4x64x112x112xf16>) -> tensor<64x3x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<3> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp146(%arg0: tensor<4x3x224x224xf16>, %arg1: tensor<4x64x112x112xf16>) -> tensor<64x3x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<3> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[3, 2], [3, 2]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x3x224x224xf16>, tensor<4x64x112x112xf16>) -> tensor<7x7x3x64xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,3,7,7]{0,1,3,2}"} : (tensor<7x7x3x64xf16>) -> tensor<64x3x7x7xf16> return %1 : tensor<64x3x7x7xf16> } - func.func private @Unknown141(%arg0: tensor) -> tensor attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown147(%arg0: tensor<4x1000xf16>, %arg1: tensor<4x1000xf32>) -> tensor attributes {__byteir_reduction_fusion__} { + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = mhlo.convert %arg0 : (tensor<4x1000xf16>) -> tensor<4x1000xf32> + %2 = mhlo.multiply %1, %arg1 : 
tensor<4x1000xf32> + %3 = mhlo.reduce(%2 init: %0) across dimensions = [0, 1] : (tensor<4x1000xf32>, tensor) -> tensor + reducer(%arg2: tensor, %arg3: tensor) { + %4 = mhlo.add %arg2, %arg3 : tensor + mhlo.return %4 : tensor + } + return %3 : tensor + } + func.func private @Unknown148(%arg0: tensor) -> tensor attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<4.000000e+00> : tensor %1 = mhlo.negate %arg0 : tensor %2 = mhlo.divide %1, %0 : tensor return %2 : tensor } - func.func private @Unknown142(%arg0: tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown149(%arg0: tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[64,3,7,7]{0,1,3,2}"} : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> return %0 : tensor<64x3x7x7xf32> } - func.func private @Unknown143(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - return %0 : tensor<64x64x3x3xf32> - } - func.func private @Unknown144(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - return %0 : tensor<64x64x3x3xf32> - } - func.func private @Unknown145(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - return %0 : tensor<64x64x3x3xf32> - } - func.func private @Unknown146(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown150(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> return %0 : tensor<64x64x3x3xf32> } - func.func private @Unknown147(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown154(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[128,64,3,3]{0,1,3,2}"} : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> return %0 : tensor<128x64x3x3xf32> } - func.func private @Unknown148(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown155(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> return %0 : tensor<128x128x3x3xf32> } - func.func private @Unknown149(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown156(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[128,64,1,1]{0,1,3,2}"} : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> return %0 : tensor<128x64x1x1xf32> } - func.func private @Unknown150(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> 
attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - return %0 : tensor<128x128x3x3xf32> - } - func.func private @Unknown151(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - return %0 : tensor<128x128x3x3xf32> - } - func.func private @Unknown152(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown159(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[256,128,3,3]{0,1,3,2}"} : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> return %0 : tensor<256x128x3x3xf32> } - func.func private @Unknown153(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown160(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> return %0 : tensor<256x256x3x3xf32> } - func.func private @Unknown154(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown161(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[256,128,1,1]{0,1,3,2}"} : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> return %0 : tensor<256x128x1x1xf32> } - func.func private @Unknown155(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - return %0 : tensor<256x256x3x3xf32> - } - func.func private @Unknown156(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - return %0 : tensor<256x256x3x3xf32> - } - func.func private @Unknown157(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown164(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[512,256,3,3]{0,1,3,2}"} : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> return %0 : tensor<512x256x3x3xf32> } - func.func private @Unknown158(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown165(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> return %0 : tensor<512x512x3x3xf32> } - func.func private @Unknown159(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown166(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { %0 = 
mhlo.convert %arg0 {xla_shape = "f32[512,256,1,1]{0,1,3,2}"} : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> return %0 : tensor<512x256x1x1xf32> } - func.func private @Unknown160(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - return %0 : tensor<512x512x3x3xf32> - } - func.func private @Unknown161(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - return %0 : tensor<512x512x3x3xf32> - } - func.func private @MatmulOp162(%arg0: tensor<4x512xf16>, %arg1: tensor<4x1000xf16>) -> tensor<1000x512xf16> attributes {__byre__lhs_contracting_dimension = 0 : i64, __byre__output_transpose, __byre__rhs_contracting_dimension = 0 : i64, byre_compute_name = "MatmulOp"} { + func.func private @MatmulOp169(%arg0: tensor<4x512xf16>, %arg1: tensor<4x1000xf16>) -> tensor<1000x512xf16> attributes {__byre__lhs_contracting_dimension = 0 : i64, __byre__output_transpose, __byre__rhs_contracting_dimension = 0 : i64, byre_compute_name = "MatmulOp"} { %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<4x512xf16>, tensor<4x1000xf16>) -> tensor<512x1000xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[1, 0]> : tensor<2xi64>, xla_shape = "f16[1000,512]{0,1}"} : (tensor<512x1000xf16>) -> tensor<1000x512xf16> return %1 : tensor<1000x512xf16> } - func.func private @Unknown163(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown170(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[1000,512]{0,1}"} : (tensor<1000x512xf16>) -> tensor<1000x512xf32> return %0 : tensor<1000x512xf32> } - func.func private @Unknown164(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown171(%arg0: tensor<4x1000xf16>) -> tensor<1000xf32> attributes {__byteir_reduction_fusion__} { + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = mhlo.convert %arg0 : (tensor<4x1000xf16>) -> tensor<4x1000xf32> + %2 = mhlo.reduce(%1 init: %0) across dimensions = [0] : (tensor<4x1000xf32>, tensor) -> tensor<1000xf32> + reducer(%arg1: tensor, %arg2: tensor) { + %3 = mhlo.add %arg1, %arg2 : tensor + mhlo.return %3 : tensor + } + return %2 : tensor<1000xf32> + } + func.func private @Unknown172(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<1000xf32>) -> tensor<1000xf16> %1 = mhlo.convert %0 : (tensor<1000xf16>) -> tensor<1000xf32> return %1 : tensor<1000xf32> } func.func @main(%arg0: tensor<4x3x224x224xf32>, %arg1: tensor<4x1000xf32>, %arg2: tensor<64x3x7x7xf32>, %arg3: tensor<64xf32>, %arg4: tensor<64xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64x64x3x3xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64xf32>, %arg10: tensor<64xf32>, %arg11: tensor<64xf32>, %arg12: tensor<64x64x3x3xf32>, %arg13: tensor<64xf32>, %arg14: tensor<64xf32>, %arg15: tensor<64xf32>, %arg16: tensor<64xf32>, %arg17: tensor<64x64x3x3xf32>, %arg18: tensor<64xf32>, %arg19: tensor<64xf32>, %arg20: tensor<64xf32>, %arg21: tensor<64xf32>, %arg22: 
tensor<64x64x3x3xf32>, %arg23: tensor<64xf32>, %arg24: tensor<64xf32>, %arg25: tensor<64xf32>, %arg26: tensor<64xf32>, %arg27: tensor<128x64x3x3xf32>, %arg28: tensor<128xf32>, %arg29: tensor<128xf32>, %arg30: tensor<128xf32>, %arg31: tensor<128xf32>, %arg32: tensor<128x128x3x3xf32>, %arg33: tensor<128xf32>, %arg34: tensor<128xf32>, %arg35: tensor<128xf32>, %arg36: tensor<128xf32>, %arg37: tensor<128x64x1x1xf32>, %arg38: tensor<128xf32>, %arg39: tensor<128xf32>, %arg40: tensor<128xf32>, %arg41: tensor<128xf32>, %arg42: tensor<128x128x3x3xf32>, %arg43: tensor<128xf32>, %arg44: tensor<128xf32>, %arg45: tensor<128xf32>, %arg46: tensor<128xf32>, %arg47: tensor<128x128x3x3xf32>, %arg48: tensor<128xf32>, %arg49: tensor<128xf32>, %arg50: tensor<128xf32>, %arg51: tensor<128xf32>, %arg52: tensor<256x128x3x3xf32>, %arg53: tensor<256xf32>, %arg54: tensor<256xf32>, %arg55: tensor<256xf32>, %arg56: tensor<256xf32>, %arg57: tensor<256x256x3x3xf32>, %arg58: tensor<256xf32>, %arg59: tensor<256xf32>, %arg60: tensor<256xf32>, %arg61: tensor<256xf32>, %arg62: tensor<256x128x1x1xf32>, %arg63: tensor<256xf32>, %arg64: tensor<256xf32>, %arg65: tensor<256xf32>, %arg66: tensor<256xf32>, %arg67: tensor<256x256x3x3xf32>, %arg68: tensor<256xf32>, %arg69: tensor<256xf32>, %arg70: tensor<256xf32>, %arg71: tensor<256xf32>, %arg72: tensor<256x256x3x3xf32>, %arg73: tensor<256xf32>, %arg74: tensor<256xf32>, %arg75: tensor<256xf32>, %arg76: tensor<256xf32>, %arg77: tensor<512x256x3x3xf32>, %arg78: tensor<512xf32>, %arg79: tensor<512xf32>, %arg80: tensor<512xf32>, %arg81: tensor<512xf32>, %arg82: tensor<512x512x3x3xf32>, %arg83: tensor<512xf32>, %arg84: tensor<512xf32>, %arg85: tensor<512xf32>, %arg86: tensor<512xf32>, %arg87: tensor<512x256x1x1xf32>, %arg88: tensor<512xf32>, %arg89: tensor<512xf32>, %arg90: tensor<512xf32>, %arg91: tensor<512xf32>, %arg92: tensor<512x512x3x3xf32>, %arg93: tensor<512xf32>, %arg94: tensor<512xf32>, %arg95: tensor<512xf32>, %arg96: tensor<512xf32>, %arg97: tensor<512x512x3x3xf32>, %arg98: tensor<512xf32>, %arg99: tensor<512xf32>, %arg100: tensor<512xf32>, %arg101: tensor<512xf32>, %arg102: tensor<1000x512xf32>, %arg103: tensor<1000xf32>) -> (tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32>) { - %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = mhlo.constant dense<0.000000e+00> : tensor - %2 = mhlo.constant dense<0xFC00> : tensor - %3 = call 
@Unknown0(%arg0) : (tensor<4x3x224x224xf32>) -> tensor<4x3x224x224xf16> - %4 = call @Unknown1(%arg2) : (tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> - %5 = mhlo.convolution(%3, %4) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x3x224x224xf16>, tensor<64x3x7x7xf16>) -> tensor<4x64x112x112xf16> - %6 = call @BatchNormTrainingOp2(%5, %arg3, %arg4) : (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x112x112xf16> - %7 = call @Unknown3(%arg7) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %8 = call @Unknown4(%arg12) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %9 = call @Unknown5(%arg17) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %10 = call @Unknown6(%arg22) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %11 = call @Unknown7(%arg37) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> - %12 = call @Unknown8(%arg27) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> - %13 = call @Unknown9(%arg32) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %14 = call @Unknown10(%arg42) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %15 = call @Unknown11(%arg47) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %16 = call @Unknown12(%arg62) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> - %17 = call @Unknown13(%arg52) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> - %18 = call @Unknown14(%arg57) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %19 = call @Unknown15(%arg67) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %20 = call @Unknown16(%arg72) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %21 = call @Unknown17(%arg87) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> - %22 = call @Unknown18(%arg77) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> - %23 = call @Unknown19(%arg82) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %24 = call @Unknown20(%arg92) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %25 = call @Unknown21(%arg97) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %26 = call @Unknown22(%arg1) : (tensor<4x1000xf32>) -> tensor<4x1000xf16> - %27 = call @Unknown23(%arg102) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> - %28 = mhlo.reduce(%26 init: %1) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %29:2 = call @Unknown24(%6) : (tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) - %30 = "mhlo.reduce_window"(%29#0, %2) ({ + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = mhlo.constant dense<0xFC00> : tensor + %2 = call @Unknown0(%arg0) : (tensor<4x3x224x224xf32>) -> tensor<4x3x224x224xf16> + %3 = call @Unknown1(%arg2) : (tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> + %4 = mhlo.convolution(%2, %3) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x3x224x224xf16>, tensor<64x3x7x7xf16>) -> tensor<4x64x112x112xf16> + %5 = call @BatchNormTrainingOp2(%4, %arg3, %arg4) : (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) -> 
tensor<4x64x112x112xf16> + %6 = call @Unknown3(%arg7) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %7 = call @Unknown3(%arg12) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %8 = call @Unknown3(%arg17) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %9 = call @Unknown3(%arg22) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %10 = call @Unknown7(%arg37) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> + %11 = call @Unknown8(%arg27) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> + %12 = call @Unknown9(%arg32) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %13 = call @Unknown9(%arg42) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %14 = call @Unknown9(%arg47) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %15 = call @Unknown12(%arg62) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> + %16 = call @Unknown13(%arg52) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> + %17 = call @Unknown14(%arg57) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %18 = call @Unknown14(%arg67) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %19 = call @Unknown14(%arg72) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %20 = call @Unknown17(%arg87) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> + %21 = call @Unknown18(%arg77) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> + %22 = call @Unknown19(%arg82) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %23 = call @Unknown19(%arg92) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %24 = call @Unknown19(%arg97) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %25 = call @Unknown22(%arg1) : (tensor<4x1000xf32>) -> tensor<4x1000xf16> + %26 = call @Unknown23(%arg102) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> + %27 = call @Unknown24(%arg103) : (tensor<1000xf32>) -> tensor<1000xf16> + %28 = call @Unknown25(%25) : (tensor<4x1000xf16>) -> tensor<4xf16> + %29:2 = call @Unknown26(%5) : (tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) + %30 = "mhlo.reduce_window"(%29#0, %1) ({ ^bb0(%arg104: tensor, %arg105: tensor): - %198 = mhlo.maximum %arg104, %arg105 : tensor - mhlo.return %198 : tensor + %199 = mhlo.maximum %arg104, %arg105 : tensor + mhlo.return %199 : tensor }) {base_dilations = dense<1> : tensor<4xi64>, padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : (tensor<4x64x112x112xf16>, tensor) -> tensor<4x64x56x56xf16> - %31 = mhlo.convolution(%30, %7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %32 = call @BatchNormTrainingOp25(%31, %arg8, %arg9) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> - %33:2 = call @Unknown26(%32) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - %34 = mhlo.convolution(%33#0, %8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, 
tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %31 = mhlo.convolution(%30, %6) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %32 = call @BatchNormTrainingOp27(%31, %arg8, %arg9) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> + %33:2 = call @Unknown28(%32) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %34 = mhlo.convolution(%33#0, %7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> %35 = call @BatchNormTrainingOp27(%34, %arg13, %arg14) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> - %36:2 = call @Unknown28(%35, %30) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - %37 = mhlo.convolution(%36#0, %9) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %38 = call @BatchNormTrainingOp29(%37, %arg18, %arg19) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> - %39:2 = call @Unknown30(%38) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - %40 = mhlo.convolution(%39#0, %10) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %41 = call @BatchNormTrainingOp31(%40, %arg23, %arg24) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> - %42:2 = call @Unknown32(%41, %36#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - %43 = mhlo.convolution(%42#0, %11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<128x64x1x1xf16>) -> tensor<4x128x28x28xf16> - %44 = call @BatchNormTrainingOp33(%43, %arg38, %arg39) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %45 = mhlo.convolution(%42#0, %12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<128x64x3x3xf16>) -> tensor<4x128x28x28xf16> - %46 = call @BatchNormTrainingOp34(%45, %arg28, %arg29) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %47:2 = 
call @Unknown35(%46) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - %48 = mhlo.convolution(%47#0, %13) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %49 = call @BatchNormTrainingOp36(%48, %arg33, %arg34) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %50:2 = call @Unknown37(%49, %44) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - %51 = mhlo.convolution(%50#0, %14) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %52 = call @BatchNormTrainingOp38(%51, %arg43, %arg44) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %53:2 = call @Unknown39(%52) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - %54 = mhlo.convolution(%53#0, %15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %55 = call @BatchNormTrainingOp40(%54, %arg48, %arg49) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %56:2 = call @Unknown41(%55, %50#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - %57 = mhlo.convolution(%56#0, %16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<256x128x1x1xf16>) -> tensor<4x256x14x14xf16> - %58 = call @BatchNormTrainingOp42(%57, %arg63, %arg64) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %59 = mhlo.convolution(%56#0, %17) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<256x128x3x3xf16>) -> tensor<4x256x14x14xf16> - %60 = call @BatchNormTrainingOp43(%59, %arg53, %arg54) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %61:2 = call @Unknown44(%60) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - %62 = mhlo.convolution(%61#0, %18) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %63 = call @BatchNormTrainingOp45(%62, 
%arg58, %arg59) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %64:2 = call @Unknown46(%63, %58) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - %65 = mhlo.convolution(%64#0, %19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %66 = call @BatchNormTrainingOp47(%65, %arg68, %arg69) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %67:2 = call @Unknown48(%66) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - %68 = mhlo.convolution(%67#0, %20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %69 = call @BatchNormTrainingOp49(%68, %arg73, %arg74) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %70:2 = call @Unknown50(%69, %64#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - %71 = mhlo.convolution(%70#0, %21) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<512x256x1x1xf16>) -> tensor<4x512x7x7xf16> - %72 = call @BatchNormTrainingOp51(%71, %arg88, %arg89) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %73 = mhlo.convolution(%70#0, %22) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<512x256x3x3xf16>) -> tensor<4x512x7x7xf16> - %74 = call @BatchNormTrainingOp52(%73, %arg78, %arg79) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %75:2 = call @Unknown53(%74) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %76 = mhlo.convolution(%75#0, %23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %77 = call @BatchNormTrainingOp54(%76, %arg83, %arg84) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %78:2 = call @Unknown55(%77, %72) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %79 = mhlo.convolution(%78#0, %24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : 
(tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %80 = call @BatchNormTrainingOp56(%79, %arg93, %arg94) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %81:2 = call @Unknown57(%80) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %82 = mhlo.convolution(%81#0, %25) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %83 = call @BatchNormTrainingOp58(%82, %arg98, %arg99) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %84:2 = call @Unknown59(%83, %78#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %85 = mhlo.reduce(%84#0 init: %1) across dimensions = [3, 2] : (tensor<4x512x7x7xf16>, tensor) -> tensor<4x512xf16> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %86 = call @Unknown60(%85) : (tensor<4x512xf16>) -> tensor<4x512xf16> - %87 = "mhlo.dot_general"(%86, %27) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<4x512xf16>, tensor<1000x512xf16>) -> tensor<4x1000xf16> - %88 = call @Unknown61(%arg103, %87) : (tensor<1000xf32>, tensor<4x1000xf16>) -> tensor<4x1000xf16> - %89 = mhlo.reduce(%88 init: %2) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.maximum %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %90:2 = call @Unknown62(%89, %88) : (tensor<4xf16>, tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) - %91 = mhlo.reduce(%90#1 init: %1) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %92:3 = call @Unknown63(%91, %90#0, %28, %26, %arg1) : (tensor<4xf16>, tensor<4x1000xf16>, tensor<4xf16>, tensor<4x1000xf16>, tensor<4x1000xf32>) -> (tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) - %93 = "mhlo.dot"(%92#0, %27) {precision_config = [#mhlo, #mhlo]} : (tensor<4x1000xf16>, tensor<1000x512xf16>) -> tensor<4x512xf16> - %94 = call @Unknown64(%93, %84#1) : (tensor<4x512xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> - %95:3 = call @BatchNormGradOp65(%82, %arg98, %94) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %96 = call @ConvBackwardDataOp66(%95#0, %25) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %97 = call @ConvBackwardFilterOp67(%81#0, %95#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %98 = call @Unknown68(%81#1, %96) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> - %99:3 = call @BatchNormGradOp69(%79, %arg93, %98) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %100 = call @ConvBackwardDataOp70(%99#0, %24) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %101 = call @ConvBackwardFilterOp71(%78#0, %99#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %102 = 
call @Unknown72(%94, %100, %78#1) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> - %103:3 = call @BatchNormGradOp73(%76, %arg83, %102) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %104 = call @ConvBackwardDataOp74(%103#0, %23) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %105 = call @ConvBackwardFilterOp75(%75#0, %103#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %106 = call @Unknown76(%75#1, %104) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> - %107:3 = call @BatchNormGradOp77(%73, %arg78, %106) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %108 = call @ConvBackwardDataOp78(%107#0, %22) : (tensor<4x512x7x7xf16>, tensor<512x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %109 = call @ConvBackwardFilterOp79(%70#0, %107#0) : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<512x256x3x3xf16> - %110:3 = call @BatchNormGradOp80(%71, %arg88, %102) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %111 = call @ConvBackwardDataOp81(%110#0, %21) : (tensor<4x512x7x7xf16>, tensor<512x256x1x1xf16>) -> tensor<4x256x14x14xf16> - %112 = call @ConvBackwardFilterOp82(%70#0, %110#0) : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<512x256x1x1xf16> - %113 = call @Unknown83(%111, %108, %70#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> - %114:3 = call @BatchNormGradOp84(%68, %arg73, %113) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %115 = call @ConvBackwardDataOp85(%114#0, %20) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %116 = call @ConvBackwardFilterOp86(%67#0, %114#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %117 = call @Unknown87(%67#1, %115) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> - %118:3 = call @BatchNormGradOp88(%65, %arg68, %117) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %119 = call @ConvBackwardDataOp89(%118#0, %19) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %120 = call @ConvBackwardFilterOp90(%64#0, %118#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %121 = call @Unknown91(%113, %119, %64#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> - %122:3 = call @BatchNormGradOp92(%62, %arg58, %121) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %123 = call @ConvBackwardDataOp93(%122#0, %18) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %124 = call @ConvBackwardFilterOp94(%61#0, %122#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %125 = call @Unknown95(%61#1, %123) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> - %126:3 = call @BatchNormGradOp96(%59, %arg53, %125) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) 
-> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %127 = call @ConvBackwardDataOp97(%126#0, %17) : (tensor<4x256x14x14xf16>, tensor<256x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %128 = call @ConvBackwardFilterOp98(%56#0, %126#0) : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<256x128x3x3xf16> - %129:3 = call @BatchNormGradOp99(%57, %arg63, %121) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %130 = call @ConvBackwardDataOp100(%129#0, %16) : (tensor<4x256x14x14xf16>, tensor<256x128x1x1xf16>) -> tensor<4x128x28x28xf16> - %131 = call @ConvBackwardFilterOp101(%56#0, %129#0) : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<256x128x1x1xf16> - %132 = call @Unknown102(%130, %127, %56#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> - %133:3 = call @BatchNormGradOp103(%54, %arg48, %132) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %134 = call @ConvBackwardDataOp104(%133#0, %15) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %135 = call @ConvBackwardFilterOp105(%53#0, %133#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %136 = call @Unknown106(%53#1, %134) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> - %137:3 = call @BatchNormGradOp107(%51, %arg43, %136) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %138 = call @ConvBackwardDataOp108(%137#0, %14) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %139 = call @ConvBackwardFilterOp109(%50#0, %137#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %140 = call @Unknown110(%132, %138, %50#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> - %141:3 = call @BatchNormGradOp111(%48, %arg33, %140) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %142 = call @ConvBackwardDataOp112(%141#0, %13) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %143 = call @ConvBackwardFilterOp113(%47#0, %141#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %144 = call @Unknown114(%47#1, %142) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> - %145:3 = call @BatchNormGradOp115(%45, %arg28, %144) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %146 = call @ConvBackwardDataOp116(%145#0, %12) : (tensor<4x128x28x28xf16>, tensor<128x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %147 = call @ConvBackwardFilterOp117(%42#0, %145#0) : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<128x64x3x3xf16> - %148:3 = call @BatchNormGradOp118(%43, %arg38, %140) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %149 = call @ConvBackwardDataOp119(%148#0, %11) : (tensor<4x128x28x28xf16>, tensor<128x64x1x1xf16>) -> tensor<4x64x56x56xf16> - %150 = call @ConvBackwardFilterOp120(%42#0, %148#0) : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> 
tensor<128x64x1x1xf16> - %151 = call @Unknown121(%149, %146, %42#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> - %152:3 = call @BatchNormGradOp122(%40, %arg23, %151) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %153 = call @ConvBackwardDataOp123(%152#0, %10) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %154 = call @ConvBackwardFilterOp124(%39#0, %152#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %155 = call @Unknown125(%39#1, %153) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %156:3 = call @BatchNormGradOp126(%37, %arg18, %155) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %157 = call @ConvBackwardDataOp127(%156#0, %9) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %158 = call @ConvBackwardFilterOp128(%36#0, %156#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %159 = call @Unknown129(%151, %157, %36#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> - %160:3 = call @BatchNormGradOp130(%34, %arg13, %159) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %161 = call @ConvBackwardDataOp131(%160#0, %8) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %162 = call @ConvBackwardFilterOp132(%33#0, %160#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %163 = call @Unknown133(%33#1, %161) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %164:3 = call @BatchNormGradOp134(%31, %arg8, %163) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %165 = call @ConvBackwardDataOp135(%164#0, %7) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %166 = call @ConvBackwardFilterOp136(%30, %164#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %167 = call @Unknown137(%159, %165) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %168 = "mhlo.select_and_scatter"(%29#0, %167, %1) ({ + %36:2 = call @Unknown30(%35, %30) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %37 = mhlo.convolution(%36#0, %8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %38 = call @BatchNormTrainingOp27(%37, %arg18, %arg19) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> + %39:2 = call @Unknown28(%38) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %40 = mhlo.convolution(%39#0, %9) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> 
tensor<4x64x56x56xf16> + %41 = call @BatchNormTrainingOp27(%40, %arg23, %arg24) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> + %42:2 = call @Unknown30(%41, %36#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %43 = mhlo.convolution(%42#0, %10) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<128x64x1x1xf16>) -> tensor<4x128x28x28xf16> + %44 = call @BatchNormTrainingOp35(%43, %arg38, %arg39) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %45 = mhlo.convolution(%42#0, %11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<128x64x3x3xf16>) -> tensor<4x128x28x28xf16> + %46 = call @BatchNormTrainingOp35(%45, %arg28, %arg29) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %47:2 = call @Unknown37(%46) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %48 = mhlo.convolution(%47#0, %12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %49 = call @BatchNormTrainingOp35(%48, %arg33, %arg34) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %50:2 = call @Unknown39(%49, %44) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %51 = mhlo.convolution(%50#0, %13) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %52 = call @BatchNormTrainingOp35(%51, %arg43, %arg44) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %53:2 = call @Unknown37(%52) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %54 = mhlo.convolution(%53#0, %14) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %55 = call @BatchNormTrainingOp35(%54, %arg48, %arg49) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %56:2 = call @Unknown39(%55, %50#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %57 = mhlo.convolution(%56#0, %15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 
: i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<256x128x1x1xf16>) -> tensor<4x256x14x14xf16> + %58 = call @BatchNormTrainingOp44(%57, %arg63, %arg64) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %59 = mhlo.convolution(%56#0, %16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<256x128x3x3xf16>) -> tensor<4x256x14x14xf16> + %60 = call @BatchNormTrainingOp44(%59, %arg53, %arg54) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %61:2 = call @Unknown46(%60) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %62 = mhlo.convolution(%61#0, %17) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %63 = call @BatchNormTrainingOp44(%62, %arg58, %arg59) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %64:2 = call @Unknown48(%63, %58) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %65 = mhlo.convolution(%64#0, %18) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %66 = call @BatchNormTrainingOp44(%65, %arg68, %arg69) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %67:2 = call @Unknown46(%66) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %68 = mhlo.convolution(%67#0, %19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %69 = call @BatchNormTrainingOp44(%68, %arg73, %arg74) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %70:2 = call @Unknown48(%69, %64#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %71 = mhlo.convolution(%70#0, %20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<512x256x1x1xf16>) -> tensor<4x512x7x7xf16> + %72 = call @BatchNormTrainingOp53(%71, %arg88, %arg89) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %73 = mhlo.convolution(%70#0, %21) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, 
feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<512x256x3x3xf16>) -> tensor<4x512x7x7xf16> + %74 = call @BatchNormTrainingOp53(%73, %arg78, %arg79) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %75:2 = call @Unknown55(%74) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %76 = mhlo.convolution(%75#0, %22) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %77 = call @BatchNormTrainingOp53(%76, %arg83, %arg84) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %78:2 = call @Unknown57(%77, %72) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %79 = mhlo.convolution(%78#0, %23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %80 = call @BatchNormTrainingOp53(%79, %arg93, %arg94) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %81:2 = call @Unknown55(%80) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %82 = mhlo.convolution(%81#0, %24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %83 = call @BatchNormTrainingOp53(%82, %arg98, %arg99) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %84:2 = call @Unknown57(%83, %78#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %85 = call @Unknown62(%84#0) : (tensor<4x512x7x7xf16>) -> tensor<4x512xf16> + %86 = call @Unknown63(%85) : (tensor<4x512xf16>) -> tensor<4x512xf16> + %87 = "mhlo.dot_general"(%86, %26) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<4x512xf16>, tensor<1000x512xf16>) -> tensor<4x1000xf16> + %88 = call @Unknown64(%27, %87) : (tensor<1000xf16>, tensor<4x1000xf16>) -> tensor<4x1000xf16> + %89 = call @Unknown65(%88) : (tensor<4x1000xf16>) -> tensor<4xf16> + %90 = call @Unknown66(%89, %88) : (tensor<4xf16>, tensor<4x1000xf16>) -> tensor<4x1000xf16> + %91 = call @Unknown67(%90) : (tensor<4x1000xf16>) -> tensor<4xf16> + %92 = call @Unknown68(%91) : (tensor<4xf16>) -> tensor<4xf16> + %93:2 = call @Unknown69(%92, %90, %28, %25) : (tensor<4xf16>, tensor<4x1000xf16>, tensor<4xf16>, tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) + %94 = "mhlo.dot"(%93#1, %26) {precision_config = [#mhlo, #mhlo]} : (tensor<4x1000xf16>, tensor<1000x512xf16>) -> tensor<4x512xf16> + %95 = call @Unknown70(%94, %84#1) : (tensor<4x512xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> + %96:3 = call @BatchNormGradOp71(%82, %arg98, %95) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, 
tensor<512xf32>) + %97 = call @ConvBackwardDataOp72(%96#0, %24) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %98 = call @ConvBackwardFilterOp73(%81#0, %96#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %99 = call @Unknown74(%81#1, %97) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> + %100:3 = call @BatchNormGradOp71(%79, %arg93, %99) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %101 = call @ConvBackwardDataOp72(%100#0, %23) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %102 = call @ConvBackwardFilterOp73(%78#0, %100#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %103 = call @Unknown78(%95, %101, %78#1) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> + %104:3 = call @BatchNormGradOp71(%76, %arg83, %103) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %105 = call @ConvBackwardDataOp72(%104#0, %22) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %106 = call @ConvBackwardFilterOp73(%75#0, %104#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %107 = call @Unknown74(%75#1, %105) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> + %108:3 = call @BatchNormGradOp71(%73, %arg78, %107) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %109 = call @ConvBackwardDataOp84(%108#0, %21) : (tensor<4x512x7x7xf16>, tensor<512x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %110 = call @ConvBackwardFilterOp85(%70#0, %108#0) : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<512x256x3x3xf16> + %111:3 = call @BatchNormGradOp71(%71, %arg88, %103) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %112 = call @ConvBackwardDataOp87(%111#0, %20) : (tensor<4x512x7x7xf16>, tensor<512x256x1x1xf16>) -> tensor<4x256x14x14xf16> + %113 = call @ConvBackwardFilterOp88(%70#0, %111#0) : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<512x256x1x1xf16> + %114 = call @Unknown89(%112, %109, %70#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> + %115:3 = call @BatchNormGradOp90(%68, %arg73, %114) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %116 = call @ConvBackwardDataOp91(%115#0, %19) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %117 = call @ConvBackwardFilterOp92(%67#0, %115#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %118 = call @Unknown93(%67#1, %116) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> + %119:3 = call @BatchNormGradOp90(%65, %arg68, %118) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %120 = call @ConvBackwardDataOp91(%119#0, %18) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %121 = call @ConvBackwardFilterOp92(%64#0, %119#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %122 = call 
@Unknown89(%114, %120, %64#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> + %123:3 = call @BatchNormGradOp90(%62, %arg58, %122) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %124 = call @ConvBackwardDataOp91(%123#0, %17) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %125 = call @ConvBackwardFilterOp92(%61#0, %123#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %126 = call @Unknown93(%61#1, %124) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> + %127:3 = call @BatchNormGradOp90(%59, %arg53, %126) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %128 = call @ConvBackwardDataOp103(%127#0, %16) : (tensor<4x256x14x14xf16>, tensor<256x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %129 = call @ConvBackwardFilterOp104(%56#0, %127#0) : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<256x128x3x3xf16> + %130:3 = call @BatchNormGradOp90(%57, %arg63, %122) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %131 = call @ConvBackwardDataOp106(%130#0, %15) : (tensor<4x256x14x14xf16>, tensor<256x128x1x1xf16>) -> tensor<4x128x28x28xf16> + %132 = call @ConvBackwardFilterOp107(%56#0, %130#0) : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<256x128x1x1xf16> + %133 = call @Unknown108(%131, %128, %56#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> + %134:3 = call @BatchNormGradOp109(%54, %arg48, %133) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %135 = call @ConvBackwardDataOp110(%134#0, %14) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %136 = call @ConvBackwardFilterOp111(%53#0, %134#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %137 = call @Unknown112(%53#1, %135) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> + %138:3 = call @BatchNormGradOp109(%51, %arg43, %137) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %139 = call @ConvBackwardDataOp110(%138#0, %13) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %140 = call @ConvBackwardFilterOp111(%50#0, %138#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %141 = call @Unknown108(%133, %139, %50#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> + %142:3 = call @BatchNormGradOp109(%48, %arg33, %141) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %143 = call @ConvBackwardDataOp110(%142#0, %12) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %144 = call @ConvBackwardFilterOp111(%47#0, %142#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %145 = call @Unknown112(%47#1, %143) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> + %146:3 = call @BatchNormGradOp109(%45, %arg28, %145) : 
(tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %147 = call @ConvBackwardDataOp122(%146#0, %11) : (tensor<4x128x28x28xf16>, tensor<128x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %148 = call @ConvBackwardFilterOp123(%42#0, %146#0) : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<128x64x3x3xf16> + %149:3 = call @BatchNormGradOp109(%43, %arg38, %141) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %150 = call @ConvBackwardDataOp125(%149#0, %10) : (tensor<4x128x28x28xf16>, tensor<128x64x1x1xf16>) -> tensor<4x64x56x56xf16> + %151 = call @ConvBackwardFilterOp126(%42#0, %149#0) : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<128x64x1x1xf16> + %152 = call @Unknown127(%150, %147, %42#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> + %153:3 = call @BatchNormGradOp128(%40, %arg23, %152) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %154 = call @ConvBackwardDataOp129(%153#0, %9) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %155 = call @ConvBackwardFilterOp130(%39#0, %153#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %156 = call @Unknown131(%39#1, %154) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %157:3 = call @BatchNormGradOp128(%37, %arg18, %156) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %158 = call @ConvBackwardDataOp129(%157#0, %8) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %159 = call @ConvBackwardFilterOp130(%36#0, %157#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %160 = call @Unknown127(%152, %158, %36#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> + %161:3 = call @BatchNormGradOp128(%34, %arg13, %160) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %162 = call @ConvBackwardDataOp129(%161#0, %7) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %163 = call @ConvBackwardFilterOp130(%33#0, %161#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %164 = call @Unknown131(%33#1, %162) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %165:3 = call @BatchNormGradOp128(%31, %arg8, %164) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %166 = call @ConvBackwardDataOp129(%165#0, %6) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %167 = call @ConvBackwardFilterOp130(%30, %165#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %168 = call @Unknown143(%160, %166) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %169 = "mhlo.select_and_scatter"(%29#0, %168, %0) ({ ^bb0(%arg104: tensor, %arg105: tensor): - %198 = mhlo.compare GE, %arg104, %arg105 : (tensor, tensor) -> tensor - mhlo.return %198 : tensor + %199 = mhlo.compare GE, %arg104, %arg105 : (tensor, tensor) -> tensor + mhlo.return %199 : tensor }, { ^bb0(%arg104: tensor, %arg105: 
tensor): - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor + %199 = mhlo.add %arg104, %arg105 : tensor + mhlo.return %199 : tensor }) {padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : (tensor<4x64x112x112xf16>, tensor<4x64x56x56xf16>, tensor) -> tensor<4x64x112x112xf16> - %169 = call @Unknown138(%29#1, %168) : (tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> - %170:3 = call @BatchNormGradOp139(%5, %arg3, %169) : (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) - %171 = call @ConvBackwardFilterOp140(%3, %170#0) : (tensor<4x3x224x224xf16>, tensor<4x64x112x112xf16>) -> tensor<64x3x7x7xf16> - %172 = mhlo.reduce(%92#1 init: %0) across dimensions = [0, 1] : (tensor<4x1000xf32>, tensor) -> tensor - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %173 = call @Unknown141(%172) : (tensor) -> tensor - %174 = call @Unknown142(%171) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> - %175 = call @Unknown143(%166) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %176 = call @Unknown144(%162) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %177 = call @Unknown145(%158) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %178 = call @Unknown146(%154) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %179 = call @Unknown147(%147) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> - %180 = call @Unknown148(%143) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %181 = call @Unknown149(%150) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> - %182 = call @Unknown150(%139) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %183 = call @Unknown151(%135) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %184 = call @Unknown152(%128) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> - %185 = call @Unknown153(%124) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %186 = call @Unknown154(%131) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> - %187 = call @Unknown155(%120) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %188 = call @Unknown156(%116) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %189 = call @Unknown157(%109) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> - %190 = call @Unknown158(%105) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %191 = call @Unknown159(%112) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> - %192 = call @Unknown160(%101) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %193 = call @Unknown161(%97) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %194 = call @MatmulOp162(%86, %92#0) : (tensor<4x512xf16>, tensor<4x1000xf16>) -> tensor<1000x512xf16> - %195 = call @Unknown163(%194) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> - %196 = mhlo.reduce(%92#2 init: %0) across dimensions = [0] : (tensor<4x1000xf32>, tensor) -> tensor<1000xf32> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %197 = call @Unknown164(%196) : (tensor<1000xf32>) -> tensor<1000xf32> - return %173, %174, %170#1, %170#2, %175, %164#1, %164#2, %176, %160#1, %160#2, %177, %156#1, %156#2, %178, %152#1, %152#2, %179, %145#1, %145#2, %180, %141#1, %141#2, %181, %148#1, %148#2, %182, %137#1, 
%137#2, %183, %133#1, %133#2, %184, %126#1, %126#2, %185, %122#1, %122#2, %186, %129#1, %129#2, %187, %118#1, %118#2, %188, %114#1, %114#2, %189, %107#1, %107#2, %190, %103#1, %103#2, %191, %110#1, %110#2, %192, %99#1, %99#2, %193, %95#1, %95#2, %195, %197 : tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32> + %170 = call @Unknown144(%29#1, %169) : (tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> + %171:3 = call @BatchNormGradOp145(%4, %arg3, %170) : (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) + %172 = call @ConvBackwardFilterOp146(%2, %171#0) : (tensor<4x3x224x224xf16>, tensor<4x64x112x112xf16>) -> tensor<64x3x7x7xf16> + %173 = call @Unknown147(%93#0, %arg1) : (tensor<4x1000xf16>, tensor<4x1000xf32>) -> tensor + %174 = call @Unknown148(%173) : (tensor) -> tensor + %175 = call @Unknown149(%172) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> + %176 = call @Unknown150(%167) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %177 = call @Unknown150(%163) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %178 = call @Unknown150(%159) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %179 = call @Unknown150(%155) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %180 = call @Unknown154(%148) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> + %181 = call @Unknown155(%144) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %182 = call @Unknown156(%151) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> + %183 = call @Unknown155(%140) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %184 = call @Unknown155(%136) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %185 = call @Unknown159(%129) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> + %186 = call @Unknown160(%125) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %187 = call @Unknown161(%132) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> + %188 = call @Unknown160(%121) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %189 = call @Unknown160(%117) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %190 = call @Unknown164(%110) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> + %191 = call @Unknown165(%106) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %192 = call @Unknown166(%113) : 
(tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> + %193 = call @Unknown165(%102) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %194 = call @Unknown165(%98) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %195 = call @MatmulOp169(%86, %93#1) : (tensor<4x512xf16>, tensor<4x1000xf16>) -> tensor<1000x512xf16> + %196 = call @Unknown170(%195) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> + %197 = call @Unknown171(%93#1) : (tensor<4x1000xf16>) -> tensor<1000xf32> + %198 = call @Unknown172(%197) : (tensor<1000xf32>) -> tensor<1000xf32> + return %174, %175, %171#1, %171#2, %176, %165#1, %165#2, %177, %161#1, %161#2, %178, %157#1, %157#2, %179, %153#1, %153#2, %180, %146#1, %146#2, %181, %142#1, %142#2, %182, %149#1, %149#2, %183, %138#1, %138#2, %184, %134#1, %134#2, %185, %127#1, %127#2, %186, %123#1, %123#2, %187, %130#1, %130#2, %188, %119#1, %119#2, %189, %115#1, %115#2, %190, %108#1, %108#2, %191, %104#1, %104#2, %192, %111#1, %111#2, %193, %100#1, %100#2, %194, %96#1, %96#2, %196, %198 : tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/3_byre_tensor_opt.mlir b/compiler/test/E2E/ResNet18/Whole/3_byre_tensor_opt.mlir index d7013596c..527c5f7c3 100644 --- a/compiler/test/E2E/ResNet18/Whole/3_byre_tensor_opt.mlir +++ b/compiler/test/E2E/ResNet18/Whole/3_byre_tensor_opt.mlir @@ -2,30 +2,78 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1) -> (d0, d1)> -#map2 = affine_map<(d0, d1) -> (d1)> -#map3 = affine_map<(d0, d1) -> (d0)> -#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> -#map5 = affine_map<() -> ()> -#map6 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> +#map1 = affine_map<(d0) -> (d0 * 2 - (d0 floordiv 512) * 1024, 1000)> +#map2 = affine_map<(d0) -> (d0 * 2 - (d0 floordiv 512) * 1024 + 2, 1000)> +#map3 = affine_map<(d0, d1) -> (d0 - d1)> +#map4 = affine_map<(d0) -> (d0 * 2)> +#map5 = affine_map<(d0) -> (d0 * 2 + 1)> +#map6 = affine_map<(d0) -> (d0 mod 64, 49)> +#map7 = affine_map<(d0) -> (d0 mod 64 + 1, 49)> +#map8 = affine_map<(d0) -> (d0 mod 128, 125)> +#map9 = affine_map<(d0) -> (d0 mod 128 + 1, 125)> +#map10 = affine_map<(d0) -> (d0 * 32)> +#map11 = affine_map<(d0) -> (d0 * -32 + 1000, 32)> +#map12 = affine_map<(d0, d1) -> (d1 * -32 + 1000, 32, d0)> +#map13 = affine_map<(d0, 
d1) -> (d1 * -32 + 1000, 32, d0 + 1)> +#map14 = affine_map<(d0)[s0] -> (d0 * 32 + s0)> module @IrToMhlo.2452 { func.func private @Unknown0(%arg0: tensor<4x3x224x224xf32>) -> tensor<4x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { + %c224 = arith.constant 224 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<4x3x224x224xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x3x224x224xf32>) outs(%0 : tensor<4x3x224x224xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<4x3x224x224xf16> + %1 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0) -> (tensor<4x3x224x224xf16>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<4x3x224x224xf16>) { + %3 = scf.for %arg5 = %c0 to %c224 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x3x224x224xf16>) { + %4 = scf.for %arg7 = %c0 to %c224 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x3x224x224xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x3x224x224xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x3x224x224xf16> + scf.yield %inserted_slice : tensor<4x3x224x224xf16> + } + scf.yield %4 : tensor<4x3x224x224xf16> + } + scf.yield %3 : tensor<4x3x224x224xf16> + } + scf.yield %2 : tensor<4x3x224x224xf16> + } return %1 : tensor<4x3x224x224xf16> } func.func private @Unknown1(%arg0: tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x3x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x3x7x7xf32>) outs(%0 : tensor<64x3x7x7xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x3x7x7xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x3x7x7xf16>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x3x7x7xf16>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x3x7x7xf16>) { + %4 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x3x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x3x7x7xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x3x7x7xf16> + scf.yield %inserted_slice : 
tensor<64x3x7x7xf16> + } + scf.yield %4 : tensor<64x3x7x7xf16> + } + scf.yield %3 : tensor<64x3x7x7xf16> + } + scf.yield %2 : tensor<64x3x7x7xf16> + } return %1 : tensor<64x3x7x7xf16> } func.func private @BatchNormTrainingOp2(%arg0: tensor<4x64x112x112xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x112x112xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { @@ -35,611 +83,1420 @@ module @IrToMhlo.2452 { return %1 : tensor<4x64x112x112xf16> } func.func private @Unknown3(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown4(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown5(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x64x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x64x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x64x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x64x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x64x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = 
tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x64x3x3xf16> + scf.yield %inserted_slice : tensor<64x64x3x3xf16> + } + scf.yield %4 : tensor<64x64x3x3xf16> + } + scf.yield %3 : tensor<64x64x3x3xf16> + } + scf.yield %2 : tensor<64x64x3x3xf16> + } return %1 : tensor<64x64x3x3xf16> } func.func private @Unknown7(%arg0: tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x1x1xf32>) outs(%0 : tensor<128x64x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x64x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x1x1xf16> + scf.yield %inserted_slice : tensor<128x64x1x1xf16> + } + scf.yield %2 : tensor<128x64x1x1xf16> + } return %1 : tensor<128x64x1x1xf16> } func.func private @Unknown8(%arg0: tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x3x3xf32>) outs(%0 : tensor<128x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x64x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x64x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x64x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x3x3xf16> + scf.yield %inserted_slice : tensor<128x64x3x3xf16> + } + scf.yield %4 : tensor<128x64x3x3xf16> + } + scf.yield %3 : 
tensor<128x64x3x3xf16> + } + scf.yield %2 : tensor<128x64x3x3xf16> + } return %1 : tensor<128x64x3x3xf16> } func.func private @Unknown9(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown10(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown11(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x128x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x128x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x128x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x128x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x128x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x128x3x3xf16> + scf.yield %inserted_slice : tensor<128x128x3x3xf16> + } + scf.yield %4 : tensor<128x128x3x3xf16> + } + scf.yield %3 : tensor<128x128x3x3xf16> + } + scf.yield %2 : tensor<128x128x3x3xf16> + } return %1 : tensor<128x128x3x3xf16> } func.func private @Unknown12(%arg0: tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x1x1xf32>) outs(%0 : tensor<256x128x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in 
: f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x128x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x1x1xf16> + scf.yield %inserted_slice : tensor<256x128x1x1xf16> + } + scf.yield %2 : tensor<256x128x1x1xf16> + } return %1 : tensor<256x128x1x1xf16> } func.func private @Unknown13(%arg0: tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x3x3xf32>) outs(%0 : tensor<256x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x128x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x128x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x128x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x3x3xf16> + scf.yield %inserted_slice : tensor<256x128x3x3xf16> + } + scf.yield %4 : tensor<256x128x3x3xf16> + } + scf.yield %3 : tensor<256x128x3x3xf16> + } + scf.yield %2 : tensor<256x128x3x3xf16> + } return %1 : tensor<256x128x3x3xf16> } func.func private @Unknown14(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown15(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes 
{__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown16(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x256x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x256x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x256x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x256x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x256x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x256x3x3xf16> + scf.yield %inserted_slice : tensor<256x256x3x3xf16> + } + scf.yield %4 : tensor<256x256x3x3xf16> + } + scf.yield %3 : tensor<256x256x3x3xf16> + } + scf.yield %2 : tensor<256x256x3x3xf16> + } return %1 : tensor<256x256x3x3xf16> } func.func private @Unknown17(%arg0: tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x1x1xf32>) outs(%0 : tensor<512x256x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x256x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x1x1xf16> + scf.yield %inserted_slice : tensor<512x256x1x1xf16> + } + scf.yield %2 : 
tensor<512x256x1x1xf16> + } return %1 : tensor<512x256x1x1xf16> } func.func private @Unknown18(%arg0: tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x3x3xf32>) outs(%0 : tensor<512x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x256x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x256x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x256x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x3x3xf16> + scf.yield %inserted_slice : tensor<512x256x3x3xf16> + } + scf.yield %4 : tensor<512x256x3x3xf16> + } + scf.yield %3 : tensor<512x256x3x3xf16> + } + scf.yield %2 : tensor<512x256x3x3xf16> + } return %1 : tensor<512x256x3x3xf16> } func.func private @Unknown19(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown20(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown21(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 
to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x512x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x512x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x512x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x512x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x512x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x512x3x3xf16> + scf.yield %inserted_slice : tensor<512x512x3x3xf16> + } + scf.yield %4 : tensor<512x512x3x3xf16> + } + scf.yield %3 : tensor<512x512x3x3xf16> + } + scf.yield %2 : tensor<512x512x3x3xf16> + } return %1 : tensor<512x512x3x3xf16> } func.func private @Unknown22(%arg0: tensor<4x1000xf32>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant -2.500000e-01 : f32 %0 = tensor.empty() : tensor<4x1000xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<4x1000xf32>) outs(%0 : tensor<4x1000xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.mulf %in, %cst : f32 - %3 = arith.truncf %2 : f32 to f16 - linalg.yield %3 : f16 - } -> tensor<4x1000xf16> + %1 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0) -> (tensor<4x1000xf16>) { + %2 = scf.for %arg3 = %c0 to %c1000 step %c1 iter_args(%arg4 = %arg2) -> (tensor<4x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<4x1000xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.mulf %in, %cst : f32 + %6 = arith.truncf %5 : f32 to f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + scf.yield %inserted_slice : tensor<4x1000xf16> + } + scf.yield %2 : tensor<4x1000xf16> + } return %1 : tensor<4x1000xf16> } func.func private @Unknown23(%arg0: tensor<1000x512xf32>) -> tensor<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1000x512xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1000x512xf32>) outs(%0 : tensor<1000x512xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<1000x512xf16> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000x512xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1000x512xf16>) { + %extracted_slice = tensor.extract_slice 
%arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<1000x512xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<1000x512xf16> + scf.yield %inserted_slice : tensor<1000x512xf16> + } + scf.yield %2 : tensor<1000x512xf16> + } return %1 : tensor<1000x512xf16> } - func.func private @Unknown24(%arg0: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown24(%arg0: tensor<1000xf32>) -> tensor<1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<1000xf16> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1] [1] [1] : tensor<1000xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %out: f16): + %4 = arith.truncf %in : f32 to f16 + linalg.yield %4 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[%arg1] [1] [1] : tensor into tensor<1000xf16> + scf.yield %inserted_slice : tensor<1000xf16> + } + return %1 : tensor<1000xf16> + } + func.func private @Unknown25(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = tensor.empty() : tensor<4xf16> + %1 = scf.forall (%arg1) in (4) shared_outs(%arg2 = %0) -> (tensor<4xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, 0] [1, 1000] [1, 1] : tensor<4x1000xf16> to tensor<1000xf16> + %expanded = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<1000xf16> into tensor<1x1000xf16> + %extracted_slice_0 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<4xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<512xf16> + %3 = scf.forall (%arg3) in (512) shared_outs(%arg4 = %2) -> (tensor<512xf16>) { + %21 = affine.min #map1(%arg3) + %22 = affine.min #map2(%arg3) + %23 = affine.apply #map3(%22, %21) + %extracted_slice_9 = tensor.extract_slice %expanded[0, %21] [1, %23] [1, 1] : tensor<1x1000xf16> to tensor + %expanded_10 = tensor.expand_shape %extracted_slice_9 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %expanded_10, %c1 : tensor<1x?xf16> + %24 = arith.cmpi ugt, %dim, %c0 : index + %25 = scf.if %24 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %26 = arith.addf %25, %cst : f16 + %dim_11 = tensor.dim %expanded_10, %c1 : tensor<1x?xf16> + %27 = arith.cmpi ugt, %dim_11, %c1 : index + %28 = scf.if %27 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c1] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %29 = arith.addf %26, %28 : f16 + %extracted_slice_12 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<512xf16> to tensor + %inserted = tensor.insert %29 into %extracted_slice_12[] : tensor + 
scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<512xf16> + } + } {mapping = [#gpu.thread]} + %expanded_1 = tensor.expand_shape %3 [[0, 1]] : tensor<512xf16> into tensor<256x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<256xf16> + %5 = scf.forall (%arg3) in (256) shared_outs(%arg4 = %4) -> (tensor<256xf16>) { + %extracted = tensor.extract %expanded_1[%arg3, %c0] : tensor<256x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_1[%arg3, %c1] : tensor<256x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<256xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<256xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %5 [[0, 1]] : tensor<256xf16> into tensor<128x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<128xf16> + %7 = scf.forall (%arg3) in (128) shared_outs(%arg4 = %6) -> (tensor<128xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<128x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_2[%arg3, %c1] : tensor<128x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<128xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %7 [[0, 1]] : tensor<128xf16> into tensor<64x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %9 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %8) -> (tensor<64xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<64x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_3[%arg3, %c1] : tensor<64x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %9 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %11 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %10) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<32x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_4[%arg3, %c1] : tensor<32x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %11 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = 
#gpu.address_space} : tensor<16xf16> + %13 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %12) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<16x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_5[%arg3, %c1] : tensor<16x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %13 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %15 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %14) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<8x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_6[%arg3, %c1] : tensor<8x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %15 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %17 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %16) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_7[%arg3, %c0] : tensor<4x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_7[%arg3, %c1] : tensor<4x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_8 = tensor.expand_shape %17 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %19 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %18) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_8[%arg3, %c0] : tensor<2x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_8[%arg3, %c1] : tensor<2x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %20 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_0) -> (tensor) { + %21 = affine.apply #map4(%arg3) + %extracted = tensor.extract %19[%21] : tensor<2xf16> + %22 = arith.addf %extracted, %cst : f16 + %23 = affine.apply #map5(%arg3) + %extracted_9 = tensor.extract %19[%23] : tensor<2xf16> + %24 = arith.addf %extracted_9, %22 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[] [] [] : tensor to tensor + %inserted = tensor.insert %24 into 
%extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %20 into %arg2[%arg1] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.block]} + return %1 : tensor<4xf16> + } + func.func private @Unknown26(%arg0: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x112x112xf16> %1 = tensor.empty() : tensor<4x64x112x112xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x64x112x112xf16>) outs(%0, %1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c112 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c112 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x112x112xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x112x112xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x112x112xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> + } + scf.yield %5#0, %5#1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> + } + scf.yield %4#0, %4#1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> + } + scf.yield %3#0, %3#1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> + } return %2#0, %2#1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> } - func.func private @BatchNormTrainingOp25(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x56x56xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 
1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @Unknown26(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = tensor.empty() : tensor<4x64x56x56xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x64x56x56xf16>) outs(%0, %1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - return %2#0, %2#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> - } func.func private @BatchNormTrainingOp27(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x56x56xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) %1 = mhlo.convert %output : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> return %1 : tensor<4x64x56x56xf16> } - func.func private @Unknown28(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = tensor.empty() : tensor<4x64x56x56xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0, %1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - return %2#0, %2#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> - } - func.func private @BatchNormTrainingOp29(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x56x56xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @Unknown30(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes 
{__byteir_elementwise_fusion__} { + func.func private @Unknown28(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x56x56xf16> %1 = tensor.empty() : tensor<4x64x56x56xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x64x56x56xf16>) outs(%0, %1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c56 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c56 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %5#0, %5#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %4#0, %4#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %3#0, %3#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } return %2#0, %2#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> } - func.func private @BatchNormTrainingOp31(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x56x56xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @Unknown32(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, 
tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown30(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x56x56xf16> %1 = tensor.empty() : tensor<4x64x56x56xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0, %1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %2:2 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %3:2 = scf.for %arg5 = %c0 to %c64 step %c1 iter_args(%arg6 = %arg3, %arg7 = %arg4) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %4:2 = scf.for %arg8 = %c0 to %c56 step %c1 iter_args(%arg9 = %arg6, %arg10 = %arg7) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %5:2 = scf.for %arg11 = %c0 to %c56 step %c1 iter_args(%arg12 = %arg9, %arg13 = %arg10) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16, %out_3: i1): + %9 = arith.addf %in, %in_2 : f16 + %10 = arith.maximumf %9, %cst : f16 + %11 = arith.cmpf ogt, %10, %cst : f16 + linalg.yield %10, %11 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg12[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + %inserted_slice_1 = tensor.insert_slice %8#1 into %arg13[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xi1> + scf.yield %inserted_slice, %inserted_slice_1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %5#0, %5#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %4#0, %4#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %3#0, %3#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } return %2#0, %2#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> } - func.func private @BatchNormTrainingOp33(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { + func.func private @BatchNormTrainingOp35(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, 
__byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> return %1 : tensor<4x128x28x28xf16> } - func.func private @BatchNormTrainingOp34(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown35(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown37(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x128x28x28xf16> %1 = tensor.empty() : tensor<4x128x28x28xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x128x28x28xf16>) outs(%0, %1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - return %2#0, %2#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> - } - func.func private @BatchNormTrainingOp36(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown37(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = tensor.empty() : tensor<4x128x28x28xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, 
#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%0, %1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - return %2#0, %2#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> - } - func.func private @BatchNormTrainingOp38(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown39(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = tensor.empty() : tensor<4x128x28x28xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x128x28x28xf16>) outs(%0, %1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c128 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c28 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c28 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %5#0, %5#1 : 
tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %4#0, %4#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %3#0, %3#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } return %2#0, %2#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> } - func.func private @BatchNormTrainingOp40(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown41(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown39(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x128x28x28xf16> %1 = tensor.empty() : tensor<4x128x28x28xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%0, %1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %2:2 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %3:2 = scf.for %arg5 = %c0 to %c128 step %c1 iter_args(%arg6 = %arg3, %arg7 = %arg4) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %4:2 = scf.for %arg8 = %c0 to %c28 step %c1 iter_args(%arg9 = %arg6, %arg10 = %arg7) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %5:2 = scf.for %arg11 = %c0 to %c28 step %c1 iter_args(%arg12 = %arg9, %arg13 = %arg10) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16, %out_3: i1): + %9 = arith.addf %in, %in_2 : f16 + %10 = arith.maximumf %9, %cst : f16 + %11 = arith.cmpf ogt, %10, %cst : f16 + linalg.yield %10, %11 : f16, i1 + } -> (tensor, tensor) + 
%inserted_slice = tensor.insert_slice %8#0 into %arg12[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xf16> + %inserted_slice_1 = tensor.insert_slice %8#1 into %arg13[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xi1> + scf.yield %inserted_slice, %inserted_slice_1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %5#0, %5#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %4#0, %4#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %3#0, %3#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } return %2#0, %2#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> } - func.func private @BatchNormTrainingOp42(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @BatchNormTrainingOp43(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown44(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = tensor.empty() : tensor<4x256x14x14xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x256x14x14xf16>) outs(%0, %1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - return %2#0, %2#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> - } - func.func private @BatchNormTrainingOp45(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, 
tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown46(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = tensor.empty() : tensor<4x256x14x14xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%0, %1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - return %2#0, %2#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> - } - func.func private @BatchNormTrainingOp47(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { + func.func private @BatchNormTrainingOp44(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> return %1 : tensor<4x256x14x14xf16> } - func.func private @Unknown48(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown46(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x256x14x14xf16> %1 = tensor.empty() : tensor<4x256x14x14xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x256x14x14xf16>) outs(%0, %1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c256 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c14 step %c1 iter_args(%arg8 = %arg5, 
%arg9 = %arg6) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c14 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %5#0, %5#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %4#0, %4#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %3#0, %3#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } return %2#0, %2#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> } - func.func private @BatchNormTrainingOp49(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown50(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown48(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x256x14x14xf16> %1 = tensor.empty() : tensor<4x256x14x14xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%0, %1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %2:2 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %3:2 = scf.for %arg5 = 
%c0 to %c256 step %c1 iter_args(%arg6 = %arg3, %arg7 = %arg4) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %4:2 = scf.for %arg8 = %c0 to %c14 step %c1 iter_args(%arg9 = %arg6, %arg10 = %arg7) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %5:2 = scf.for %arg11 = %c0 to %c14 step %c1 iter_args(%arg12 = %arg9, %arg13 = %arg10) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16, %out_3: i1): + %9 = arith.addf %in, %in_2 : f16 + %10 = arith.maximumf %9, %cst : f16 + %11 = arith.cmpf ogt, %10, %cst : f16 + linalg.yield %10, %11 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg12[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xf16> + %inserted_slice_1 = tensor.insert_slice %8#1 into %arg13[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xi1> + scf.yield %inserted_slice, %inserted_slice_1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %5#0, %5#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %4#0, %4#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %3#0, %3#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } return %2#0, %2#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> } - func.func private @BatchNormTrainingOp51(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { + func.func private @BatchNormTrainingOp53(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> return %1 : tensor<4x512x7x7xf16> } - func.func private @BatchNormTrainingOp52(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %1 : 
tensor<4x512x7x7xf16> - } - func.func private @Unknown53(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown55(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> %1 = tensor.empty() : tensor<4x512x7x7xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x512x7x7xf16>) outs(%0, %1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - return %2#0, %2#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> - } - func.func private @BatchNormTrainingOp54(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @Unknown55(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = tensor.empty() : tensor<4x512x7x7xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%0, %1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c512 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c7 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : 
tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %5#0, %5#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %4#0, %4#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %3#0, %3#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } return %2#0, %2#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> } - func.func private @BatchNormTrainingOp56(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @Unknown57(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown57(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> %1 = tensor.empty() : tensor<4x512x7x7xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x512x7x7xf16>) outs(%0, %1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %2:2 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %3:2 = scf.for %arg5 = %c0 to %c512 step %c1 iter_args(%arg6 = %arg3, %arg7 = %arg4) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %4:2 = scf.for %arg8 = %c0 to %c7 step %c1 iter_args(%arg9 = %arg6, %arg10 = %arg7) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %5:2 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %arg9, %arg13 = %arg10) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 
= linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16, %out_3: i1): + %9 = arith.addf %in, %in_2 : f16 + %10 = arith.maximumf %9, %cst : f16 + %11 = arith.cmpf ogt, %10, %cst : f16 + linalg.yield %10, %11 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg12[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + %inserted_slice_1 = tensor.insert_slice %8#1 into %arg13[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xi1> + scf.yield %inserted_slice, %inserted_slice_1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %5#0, %5#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %4#0, %4#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %3#0, %3#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } return %2#0, %2#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> } - func.func private @BatchNormTrainingOp58(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @Unknown59(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown62(%arg0: tensor<4x512x7x7xf16>) -> tensor<4x512xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = tensor.empty() : tensor<4x512x7x7xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%0, %1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - return %2#0, %2#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> - } - func.func private @Unknown60(%arg0: tensor<4x512xf16>) -> tensor<4x512xf16> attributes {__byteir_elementwise_fusion__} { + %collapsed = tensor.collapse_shape %arg0 [[0, 1], [2, 3]] : tensor<4x512x7x7xf16> into tensor<2048x49xf16> + %0 = tensor.empty() : tensor<2048xf16> + %1 = scf.forall (%arg1) in (2048) shared_outs(%arg2 = %0) -> (tensor<2048xf16>) { + %extracted_slice = tensor.extract_slice %collapsed[%arg1, 0] [1, 49] [1, 1] : tensor<2048x49xf16> to tensor<49xf16> + %expanded_0 = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<49xf16> into tensor<1x49xf16> + %extracted_slice_1 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<2048xf16> to 
tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %3 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %2) -> (tensor<64xf16>) { + %15 = affine.min #map6(%arg3) + %16 = affine.min #map7(%arg3) + %17 = affine.apply #map3(%16, %15) + %extracted_slice_7 = tensor.extract_slice %expanded_0[0, %15] [1, %17] [1, 1] : tensor<1x49xf16> to tensor + %expanded_8 = tensor.expand_shape %extracted_slice_7 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %expanded_8, %c1 : tensor<1x?xf16> + %18 = arith.cmpi ugt, %dim, %c0 : index + %19 = scf.if %18 -> (f16) { + %extracted = tensor.extract %expanded_8[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %20 = arith.addf %19, %cst : f16 + %extracted_slice_9 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %20 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %3 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %5 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %4) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<32x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_2[%arg3, %c1] : tensor<32x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %5 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %7 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %6) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<16x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_3[%arg3, %c1] : tensor<16x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %7 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %9 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %8) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<8x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_4[%arg3, %c1] : tensor<8x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %9 [[0, 1]] : 
tensor<8xf16> into tensor<4x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %11 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %10) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<4x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_5[%arg3, %c1] : tensor<4x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %11 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %13 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %12) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<2x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_6[%arg3, %c1] : tensor<2x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %14 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_1) -> (tensor) { + %15 = affine.apply #map4(%arg3) + %extracted = tensor.extract %13[%15] : tensor<2xf16> + %16 = arith.addf %extracted, %cst : f16 + %17 = affine.apply #map5(%arg3) + %extracted_7 = tensor.extract %13[%17] : tensor<2xf16> + %18 = arith.addf %extracted_7, %16 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[] [] [] : tensor to tensor + %inserted = tensor.insert %18 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %14 into %arg2[%arg1] [1] [1] : tensor into tensor<2048xf16> + } + } {mapping = [#gpu.block]} + %expanded = tensor.expand_shape %1 [[0, 1]] : tensor<2048xf16> into tensor<4x512xf16> + return %expanded : tensor<4x512xf16> + } + func.func private @Unknown63(%arg0: tensor<4x512xf16>) -> tensor<4x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 2.040100e-02 : f16 %0 = tensor.empty() : tensor<4x512xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<4x512xf16>) outs(%0 : tensor<4x512xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.mulf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x512xf16> + %1 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0) -> (tensor<4x512xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<4x512xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<4x512xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: 
f16): + %5 = arith.mulf %in, %cst : f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<4x512xf16> + scf.yield %inserted_slice : tensor<4x512xf16> + } + scf.yield %2 : tensor<4x512xf16> + } return %1 : tensor<4x512xf16> } - func.func private @Unknown61(%arg0: tensor<1000xf32>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown64(%arg0: tensor<1000xf16>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<4x1000xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map2, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : tensor<4x1000xf16>, tensor<1000xf32>) outs(%0 : tensor<4x1000xf16>) { - ^bb0(%in: f16, %in_0: f32, %out: f16): - %2 = arith.truncf %in_0 : f32 to f16 - %3 = arith.addf %in, %2 : f16 - linalg.yield %3 : f16 - } -> tensor<4x1000xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x1000xf16>) { + %2 = scf.for %arg4 = %c0 to %c1000 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg4] [1] [1] : tensor<1000xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4] [1, 1] [1, 1] : tensor<4x1000xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %5 = arith.addf %in_1, %in : f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + scf.yield %inserted_slice : tensor<4x1000xf16> + } + scf.yield %2 : tensor<4x1000xf16> + } return %1 : tensor<4x1000xf16> } - func.func private @Unknown62(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown65(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = tensor.empty() : tensor<4xf16> + %1 = scf.forall (%arg1) in (4) shared_outs(%arg2 = %0) -> (tensor<4xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, 0] [1, 1000] [1, 1] : tensor<4x1000xf16> to tensor<1000xf16> + %expanded = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<1000xf16> into tensor<1x1000xf16> + %extracted_slice_0 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<4xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<512xf16> + %3 = scf.forall (%arg3) in (512) shared_outs(%arg4 = %2) -> (tensor<512xf16>) { + %21 = affine.min #map1(%arg3) + %22 = affine.min #map2(%arg3) + %23 = affine.apply #map3(%22, %21) + %extracted_slice_9 = tensor.extract_slice %expanded[0, %21] [1, %23] [1, 1] : tensor<1x1000xf16> to tensor + %expanded_10 = tensor.expand_shape %extracted_slice_9 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %expanded_10, %c1 : tensor<1x?xf16> + %24 = arith.cmpi ugt, %dim, %c0 : index + %25 = scf.if %24 -> (f16) { + %extracted = tensor.extract 
%expanded_10[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %dim_11 = tensor.dim %expanded_10, %c1 : tensor<1x?xf16> + %26 = arith.cmpi ugt, %dim_11, %c1 : index + %27 = scf.if %26 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c1] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %28 = arith.maximumf %25, %27 : f16 + %extracted_slice_12 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<512xf16> to tensor + %inserted = tensor.insert %28 into %extracted_slice_12[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<512xf16> + } + } {mapping = [#gpu.thread]} + %expanded_1 = tensor.expand_shape %3 [[0, 1]] : tensor<512xf16> into tensor<256x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<256xf16> + %5 = scf.forall (%arg3) in (256) shared_outs(%arg4 = %4) -> (tensor<256xf16>) { + %extracted = tensor.extract %expanded_1[%arg3, %c0] : tensor<256x2xf16> + %extracted_9 = tensor.extract %expanded_1[%arg3, %c1] : tensor<256x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<256xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<256xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %5 [[0, 1]] : tensor<256xf16> into tensor<128x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<128xf16> + %7 = scf.forall (%arg3) in (128) shared_outs(%arg4 = %6) -> (tensor<128xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<128x2xf16> + %extracted_9 = tensor.extract %expanded_2[%arg3, %c1] : tensor<128x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<128xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %7 [[0, 1]] : tensor<128xf16> into tensor<64x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %9 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %8) -> (tensor<64xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<64x2xf16> + %extracted_9 = tensor.extract %expanded_3[%arg3, %c1] : tensor<64x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %9 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %11 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %10) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<32x2xf16> + %extracted_9 = tensor.extract %expanded_4[%arg3, %c1] : tensor<32x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + 
%extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %11 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %13 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %12) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<16x2xf16> + %extracted_9 = tensor.extract %expanded_5[%arg3, %c1] : tensor<16x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %13 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %15 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %14) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<8x2xf16> + %extracted_9 = tensor.extract %expanded_6[%arg3, %c1] : tensor<8x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %15 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %17 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %16) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_7[%arg3, %c0] : tensor<4x2xf16> + %extracted_9 = tensor.extract %expanded_7[%arg3, %c1] : tensor<4x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_8 = tensor.expand_shape %17 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %19 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %18) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_8[%arg3, %c0] : tensor<2x2xf16> + %extracted_9 = tensor.extract %expanded_8[%arg3, %c1] : tensor<2x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %20 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_0) -> (tensor) { + %21 = affine.apply #map4(%arg3) + %extracted = 
tensor.extract %19[%21] : tensor<2xf16> + %22 = affine.apply #map5(%arg3) + %extracted_9 = tensor.extract %19[%22] : tensor<2xf16> + %23 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[] [] [] : tensor to tensor + %inserted = tensor.insert %23 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %20 into %arg2[%arg1] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.block]} + return %1 : tensor<4xf16> + } + func.func private @Unknown66(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<4x1000xf16> - %1:2 = linalg.generic {indexing_maps = [#map1, #map3, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : tensor<4x1000xf16>, tensor<4xf16>) outs(%0, %0 : tensor<4x1000xf16>, tensor<4x1000xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: f16): - %2 = arith.subf %in, %in_0 : f16 - %3 = math.exp %2 : f16 - linalg.yield %2, %3 : f16, f16 - } -> (tensor<4x1000xf16>, tensor<4x1000xf16>) - return %1#0, %1#1 : tensor<4x1000xf16>, tensor<4x1000xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x1000xf16>) { + %2 = scf.for %arg4 = %c0 to %c1000 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<4xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4] [1, 1] [1, 1] : tensor<4x1000xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %5 = arith.subf %in_1, %in : f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + scf.yield %inserted_slice : tensor<4x1000xf16> + } + scf.yield %2 : tensor<4x1000xf16> + } + return %1 : tensor<4x1000xf16> } - func.func private @Unknown63(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>, %arg2: tensor<4xf16>, %arg3: tensor<4x1000xf16>, %arg4: tensor<4x1000xf32>) -> (tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<4x1000xf16> - %1 = tensor.empty() : tensor<4x1000xf32> - %2:3 = linalg.generic {indexing_maps = [#map1, #map1, #map3, #map3, #map1, #map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg3, %arg1, %arg0, %arg2, %arg4 : tensor<4x1000xf16>, tensor<4x1000xf16>, tensor<4xf16>, tensor<4xf16>, tensor<4x1000xf32>) outs(%0, %1, %1 : tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %in_2: f16, %in_3: f32, %out: f16, %out_4: f32, %out_5: f32): - %3 = math.log %in_1 : f16 - %4 = arith.subf %in_0, %3 : f16 - %5 = math.exp %4 : f16 - %6 = arith.mulf %5, %in_2 : f16 - %7 = arith.subf %in, %6 : f16 - %8 = arith.extf %4 : f16 to f32 - %9 = arith.mulf %8, %in_3 : f32 - %10 = arith.extf %7 : f16 to f32 - linalg.yield %7, %9, %10 : f16, f32, f32 - } -> (tensor<4x1000xf16>, tensor<4x1000xf32>, 
tensor<4x1000xf32>) - return %2#0, %2#1, %2#2 : tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32> - } - func.func private @Unknown64(%arg0: tensor<4x512xf16>, %arg1: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown67(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map4, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1, %arg0 : tensor<4x512x7x7xi1>, tensor<4x512xf16>) outs(%0 : tensor<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_1: f16, %out: f16): - %2 = arith.divf %in_1, %cst_0 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @BatchNormGradOp65(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %1 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp66(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<4x512x7x7xf16> - return %2 : tensor<4x512x7x7xf16> + %0 = tensor.empty() : tensor<4xf16> + %1 = scf.forall (%arg1) in (4) shared_outs(%arg2 = %0) -> (tensor<4xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, 0] [1, 1000] [1, 1] : tensor<4x1000xf16> to tensor<1000xf16> + %expanded = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<1000xf16> into 
tensor<1x1000xf16> + %extracted_slice_0 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<4xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<512xf16> + %3 = scf.forall (%arg3) in (512) shared_outs(%arg4 = %2) -> (tensor<512xf16>) { + %21 = affine.min #map1(%arg3) + %22 = affine.min #map2(%arg3) + %23 = affine.apply #map3(%22, %21) + %extracted_slice_9 = tensor.extract_slice %expanded[0, %21] [1, %23] [1, 1] : tensor<1x1000xf16> to tensor + %expanded_10 = tensor.expand_shape %extracted_slice_9 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %expanded_10, %c1 : tensor<1x?xf16> + %24 = arith.cmpi ugt, %dim, %c0 : index + %25 = scf.if %24 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %26 = math.exp %25 : f16 + %27 = arith.addf %26, %cst : f16 + %dim_11 = tensor.dim %expanded_10, %c1 : tensor<1x?xf16> + %28 = arith.cmpi ugt, %dim_11, %c1 : index + %29 = scf.if %28 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c1] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %30 = math.exp %29 : f16 + %31 = arith.addf %27, %30 : f16 + %extracted_slice_12 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<512xf16> to tensor + %inserted = tensor.insert %31 into %extracted_slice_12[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<512xf16> + } + } {mapping = [#gpu.thread]} + %expanded_1 = tensor.expand_shape %3 [[0, 1]] : tensor<512xf16> into tensor<256x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<256xf16> + %5 = scf.forall (%arg3) in (256) shared_outs(%arg4 = %4) -> (tensor<256xf16>) { + %extracted = tensor.extract %expanded_1[%arg3, %c0] : tensor<256x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_1[%arg3, %c1] : tensor<256x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<256xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<256xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %5 [[0, 1]] : tensor<256xf16> into tensor<128x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<128xf16> + %7 = scf.forall (%arg3) in (128) shared_outs(%arg4 = %6) -> (tensor<128xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<128x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_2[%arg3, %c1] : tensor<128x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<128xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %7 [[0, 1]] : tensor<128xf16> into tensor<64x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %9 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %8) -> (tensor<64xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<64x2xf16> + %21 = arith.addf 
%extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_3[%arg3, %c1] : tensor<64x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %9 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %11 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %10) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<32x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_4[%arg3, %c1] : tensor<32x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %11 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %13 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %12) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<16x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_5[%arg3, %c1] : tensor<16x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %13 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %15 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %14) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<8x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_6[%arg3, %c1] : tensor<8x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %15 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %17 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %16) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_7[%arg3, %c0] : tensor<4x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_7[%arg3, %c1] : tensor<4x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel 
{ + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_8 = tensor.expand_shape %17 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %19 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %18) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_8[%arg3, %c0] : tensor<2x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_8[%arg3, %c1] : tensor<2x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %20 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_0) -> (tensor) { + %21 = affine.apply #map4(%arg3) + %extracted = tensor.extract %19[%21] : tensor<2xf16> + %22 = arith.addf %extracted, %cst : f16 + %23 = affine.apply #map5(%arg3) + %extracted_9 = tensor.extract %19[%23] : tensor<2xf16> + %24 = arith.addf %extracted_9, %22 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[] [] [] : tensor to tensor + %inserted = tensor.insert %24 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %20 into %arg2[%arg1] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.block]} + return %1 : tensor<4xf16> + } + func.func private @Unknown68(%arg0: tensor<4xf16>) -> tensor<4xf16> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<4xf16> + %1 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0) -> (tensor<4xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1] [1] [1] : tensor<4xf16> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f16, %out: f16): + %4 = math.log %in : f16 + linalg.yield %4 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[%arg1] [1] [1] : tensor into tensor<4xf16> + scf.yield %inserted_slice : tensor<4xf16> + } + return %1 : tensor<4xf16> } - func.func private @ConvBackwardFilterOp67(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,512,3,3]{0,1,3,2}"} : 
(tensor<3x3x512x512xf16>) -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> + func.func private @Unknown69(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>, %arg2: tensor<4xf16>, %arg3: tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<4x1000xf16> + %1:2 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %0, %arg6 = %0) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) { + %2:2 = scf.for %arg7 = %c0 to %c1000 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg2[%arg4] [1] [1] : tensor<4xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg0[%arg4] [1] [1] : tensor<4xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg1[%arg4, %arg7] [1, 1] [1, 1] : tensor<4x1000xf16> to tensor + %extracted_slice_2 = tensor.extract_slice %arg3[%arg4, %arg7] [1, 1] [1, 1] : tensor<4x1000xf16> to tensor + %3 = tensor.empty() : tensor + %4:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1, %extracted_slice_2 : tensor, tensor, tensor, tensor) outs(%3, %3 : tensor, tensor) { + ^bb0(%in: f16, %in_4: f16, %in_5: f16, %in_6: f16, %out: f16, %out_7: f16): + %5 = arith.subf %in_5, %in_4 : f16 + %6 = math.exp %5 : f16 + %7 = arith.mulf %6, %in : f16 + %8 = arith.subf %in_6, %7 : f16 + linalg.yield %5, %8 : f16, f16 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %4#0 into %arg8[%arg4, %arg7] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + %inserted_slice_3 = tensor.insert_slice %4#1 into %arg9[%arg4, %arg7] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + scf.yield %inserted_slice, %inserted_slice_3 : tensor<4x1000xf16>, tensor<4x1000xf16> + } + scf.yield %2#0, %2#1 : tensor<4x1000xf16>, tensor<4x1000xf16> + } + return %1#0, %1#1 : tensor<4x1000xf16>, tensor<4x1000xf16> } - func.func private @Unknown68(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown70(%arg0: tensor<4x512xf16>, %arg1: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 + %cst_0 = arith.constant 4.900000e+01 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) outs(%0 : tensor<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c512 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x512x7x7xf16>) { + %4 = scf.for %arg8 = %c0 to %c7 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice 
%arg0[%arg2, %arg4] [1, 1] [1, 1] : tensor<4x512xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: i1, %out: f16): + %7 = arith.divf %in, %cst_0 : f16 + %8 = arith.select %in_2, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + scf.yield %inserted_slice : tensor<4x512x7x7xf16> + } + scf.yield %4 : tensor<4x512x7x7xf16> + } + scf.yield %3 : tensor<4x512x7x7xf16> + } + scf.yield %2 : tensor<4x512x7x7xf16> + } return %1 : tensor<4x512x7x7xf16> } - func.func private @BatchNormGradOp69(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + func.func private @BatchNormGradOp71(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> %1 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> %2 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> @@ -647,165 +1504,136 @@ module @IrToMhlo.2452 { %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> } - func.func private @ConvBackwardDataOp70(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp72(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<3x3x512x512xf16>) -> 
tensor<4x512x7x7xf16> return %2 : tensor<4x512x7x7xf16> } - func.func private @ConvBackwardFilterOp71(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp73(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<3x3x512x512xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,512,3,3]{0,1,3,2}"} : (tensor<3x3x512x512xf16>) -> tensor<512x512x3x3xf16> return %1 : tensor<512x512x3x3xf16> } - func.func private @Unknown72(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>, %arg2: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown74(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%0 : tensor<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c512 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x512x7x7xf16>) { + %4 = scf.for %arg8 = %c0 to %c7 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor 
+ %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + scf.yield %inserted_slice : tensor<4x512x7x7xf16> + } + scf.yield %4 : tensor<4x512x7x7xf16> + } + scf.yield %3 : tensor<4x512x7x7xf16> + } + scf.yield %2 : tensor<4x512x7x7xf16> + } return %1 : tensor<4x512x7x7xf16> } - func.func private @BatchNormGradOp73(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %1 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp74(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<4x512x7x7xf16> - return %2 : tensor<4x512x7x7xf16> - } - func.func private @ConvBackwardFilterOp75(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,512,3,3]{0,1,3,2}"} : (tensor<3x3x512x512xf16>) -> 
tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown76(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>, %arg2: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) outs(%0 : tensor<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x512x7x7xf16> + %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x512x7x7xf16>) { + %2 = scf.for %arg5 = %c0 to %c512 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x512x7x7xf16>) { + %3 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x512x7x7xf16>) { + %4 = scf.for %arg9 = %c0 to %c7 step %c1 iter_args(%arg10 = %arg8) -> (tensor<4x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: i1, %out: f16): + %7 = arith.addf %in, %in_2 : f16 + %8 = arith.select %in_3, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + scf.yield %inserted_slice : tensor<4x512x7x7xf16> + } + scf.yield %4 : tensor<4x512x7x7xf16> + } + scf.yield %3 : tensor<4x512x7x7xf16> + } + scf.yield %2 : tensor<4x512x7x7xf16> + } return %1 : tensor<4x512x7x7xf16> } - func.func private @BatchNormGradOp77(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %1 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %3, %grad_scale, %grad_offset : 
tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp78(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp84(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,256,512]{1,0,2,3}"} : (tensor<512x256x3x3xf16>) -> tensor<3x3x256x512xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,256,512]{1,0,2,3}"} : (tensor<3x3x256x512xf16>) -> tensor<3x3x256x512xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 2], [1, 2]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<3x3x256x512xf16>) -> tensor<4x256x14x14xf16> return %2 : tensor<4x256x14x14xf16> } - func.func private @ConvBackwardFilterOp79(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp85(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 0], [1, 0]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<3x3x256x512xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,256,3,3]{0,1,3,2}"} : (tensor<3x3x256x512xf16>) -> tensor<512x256x3x3xf16> return %1 : tensor<512x256x3x3xf16> } - func.func private @BatchNormGradOp80(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - 
%1 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp81(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x256x1x1xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp87(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x256x1x1xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[1,1,256,512]{1,0,2,3}"} : (tensor<512x256x1x1xf16>) -> tensor<1x1x256x512xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<1x1x256x512xf16>) -> tensor<4x256x14x14xf16> return %1 : tensor<4x256x14x14xf16> } - func.func private @ConvBackwardFilterOp82(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x256x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp88(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x256x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[0, -1], [0, -1]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<1x1x256x512xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,256,1,1]{0,1,3,2}"} : (tensor<1x1x256x512xf16>) -> 
tensor<512x256x1x1xf16> return %1 : tensor<512x256x1x1xf16> } - func.func private @Unknown83(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%0 : tensor<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @BatchNormGradOp84(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %1 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp85(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<4x256x14x14xf16> - return %2 : tensor<4x256x14x14xf16> - } - func.func private @ConvBackwardFilterOp86(%arg0: tensor<4x256x14x14xf16>, %arg1: 
tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,256,3,3]{0,1,3,2}"} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown87(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) outs(%0 : tensor<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @BatchNormGradOp88(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %1 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp89(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 
1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<4x256x14x14xf16> - return %2 : tensor<4x256x14x14xf16> - } - func.func private @ConvBackwardFilterOp90(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,256,3,3]{0,1,3,2}"} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown91(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%0 : tensor<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x256x14x14xf16> + %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x256x14x14xf16>) { + %2 = scf.for %arg5 = %c0 to %c256 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x256x14x14xf16>) { + %3 = scf.for %arg7 = %c0 to %c14 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x256x14x14xf16>) { + %4 = scf.for %arg9 = %c0 to %c14 step %c1 iter_args(%arg10 = %arg8) -> (tensor<4x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: i1, %out: f16): + %7 = arith.addf %in, %in_2 : f16 + %8 = arith.select %in_3, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xf16> + scf.yield %inserted_slice : tensor<4x256x14x14xf16> + } + scf.yield %4 : tensor<4x256x14x14xf16> + } + 
scf.yield %3 : tensor<4x256x14x14xf16> + } + scf.yield %2 : tensor<4x256x14x14xf16> + } return %1 : tensor<4x256x14x14xf16> } - func.func private @BatchNormGradOp92(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + func.func private @BatchNormGradOp90(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> %1 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> %2 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> @@ -813,135 +1641,103 @@ module @IrToMhlo.2452 { %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> } - func.func private @ConvBackwardDataOp93(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp91(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<4x256x14x14xf16> return %2 : tensor<4x256x14x14xf16> } - func.func private @ConvBackwardFilterOp94(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp92(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, 
__byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<3x3x256x256xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,256,3,3]{0,1,3,2}"} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> return %1 : tensor<256x256x3x3xf16> } - func.func private @Unknown95(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown93(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) outs(%0 : tensor<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x256x14x14xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x256x14x14xf16>) { + %2 = scf.for %arg4 = %c0 to %c256 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x256x14x14xf16>) { + %3 = scf.for %arg6 = %c0 to %c14 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x256x14x14xf16>) { + %4 = scf.for %arg8 = %c0 to %c14 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xf16> + scf.yield %inserted_slice : tensor<4x256x14x14xf16> + } + scf.yield %4 : tensor<4x256x14x14xf16> + } + scf.yield %3 : tensor<4x256x14x14xf16> + } + scf.yield %2 : tensor<4x256x14x14xf16> + } return %1 : tensor<4x256x14x14xf16> } - func.func private @BatchNormGradOp96(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %1 = mhlo.convert %arg0 : 
(tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp97(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp103(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,128,256]{1,0,2,3}"} : (tensor<256x128x3x3xf16>) -> tensor<3x3x128x256xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,128,256]{1,0,2,3}"} : (tensor<3x3x128x256xf16>) -> tensor<3x3x128x256xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 2], [1, 2]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<3x3x128x256xf16>) -> tensor<4x128x28x28xf16> return %2 : tensor<4x128x28x28xf16> } - func.func private @ConvBackwardFilterOp98(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp104(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 0], [1, 0]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> 
tensor<3x3x128x256xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,128,3,3]{0,1,3,2}"} : (tensor<3x3x128x256xf16>) -> tensor<256x128x3x3xf16> return %1 : tensor<256x128x3x3xf16> } - func.func private @BatchNormGradOp99(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %1 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp100(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x128x1x1xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp106(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x128x1x1xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[1,1,128,256]{1,0,2,3}"} : (tensor<256x128x1x1xf16>) -> tensor<1x1x128x256xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<1x1x128x256xf16>) -> tensor<4x128x28x28xf16> return %1 : tensor<4x128x28x28xf16> } - func.func private @ConvBackwardFilterOp101(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x128x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp107(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x128x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", 
__byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[0, -1], [0, -1]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<1x1x128x256xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,128,1,1]{0,1,3,2}"} : (tensor<1x1x128x256xf16>) -> tensor<256x128x1x1xf16> return %1 : tensor<256x128x1x1xf16> } - func.func private @Unknown102(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown108(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%0 : tensor<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @BatchNormGradOp103(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %1 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp104(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> - %1 = 
"mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<4x128x28x28xf16> - return %2 : tensor<4x128x28x28xf16> - } - func.func private @ConvBackwardFilterOp105(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,128,3,3]{0,1,3,2}"} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown106(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) outs(%0 : tensor<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @BatchNormGradOp107(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %1 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp108(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, 
__byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<4x128x28x28xf16> - return %2 : tensor<4x128x28x28xf16> - } - func.func private @ConvBackwardFilterOp109(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,128,3,3]{0,1,3,2}"} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown110(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%0 : tensor<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x128x28x28xf16> + %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x128x28x28xf16>) { + %2 = scf.for %arg5 = %c0 to %c128 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x128x28x28xf16>) { + %3 = scf.for %arg7 = %c0 to %c28 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x128x28x28xf16>) { + %4 = scf.for %arg9 = %c0 to %c28 step %c1 iter_args(%arg10 = %arg8) -> (tensor<4x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : 
tensor<4x128x28x28xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: i1, %out: f16): + %7 = arith.addf %in, %in_2 : f16 + %8 = arith.select %in_3, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xf16> + scf.yield %inserted_slice : tensor<4x128x28x28xf16> + } + scf.yield %4 : tensor<4x128x28x28xf16> + } + scf.yield %3 : tensor<4x128x28x28xf16> + } + scf.yield %2 : tensor<4x128x28x28xf16> + } return %1 : tensor<4x128x28x28xf16> } - func.func private @BatchNormGradOp111(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + func.func private @BatchNormGradOp109(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> %1 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> %2 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> @@ -949,135 +1745,103 @@ module @IrToMhlo.2452 { %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> } - func.func private @ConvBackwardDataOp112(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp110(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<4x128x28x28xf16> return %2 : tensor<4x128x28x28xf16> } - func.func private 
@ConvBackwardFilterOp113(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp111(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<3x3x128x128xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,128,3,3]{0,1,3,2}"} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> return %1 : tensor<128x128x3x3xf16> } - func.func private @Unknown114(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown112(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) outs(%0 : tensor<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x128x28x28xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x128x28x28xf16>) { + %2 = scf.for %arg4 = %c0 to %c128 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x128x28x28xf16>) { + %3 = scf.for %arg6 = %c0 to %c28 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x128x28x28xf16>) { + %4 = scf.for %arg8 = %c0 to %c28 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into 
tensor<4x128x28x28xf16> + scf.yield %inserted_slice : tensor<4x128x28x28xf16> + } + scf.yield %4 : tensor<4x128x28x28xf16> + } + scf.yield %3 : tensor<4x128x28x28xf16> + } + scf.yield %2 : tensor<4x128x28x28xf16> + } return %1 : tensor<4x128x28x28xf16> } - func.func private @BatchNormGradOp115(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %1 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp116(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp122(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,128]{1,0,2,3}"} : (tensor<128x64x3x3xf16>) -> tensor<3x3x64x128xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,128]{1,0,2,3}"} : (tensor<3x3x64x128xf16>) -> tensor<3x3x64x128xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 2], [1, 2]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<3x3x64x128xf16>) -> tensor<4x64x56x56xf16> return %2 : tensor<4x64x56x56xf16> } - func.func private @ConvBackwardFilterOp117(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp123(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x64x3x3xf16> attributes 
{__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 0], [1, 0]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<3x3x64x128xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x128xf16>) -> tensor<128x64x3x3xf16> return %1 : tensor<128x64x3x3xf16> } - func.func private @BatchNormGradOp118(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %1 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp119(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x64x1x1xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp125(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x64x1x1xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[1,1,64,128]{1,0,2,3}"} : (tensor<128x64x1x1xf16>) -> tensor<1x1x64x128xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<1x1x64x128xf16>) -> tensor<4x64x56x56xf16> return %1 : tensor<4x64x56x56xf16> } - func.func private @ConvBackwardFilterOp120(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x64x1x1xf16> attributes 
{__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp126(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x64x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[0, -1], [0, -1]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<1x1x64x128xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,64,1,1]{0,1,3,2}"} : (tensor<1x1x64x128xf16>) -> tensor<128x64x1x1xf16> return %1 : tensor<128x64x1x1xf16> } - func.func private @Unknown121(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown127(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @BatchNormGradOp122(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %1 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %2 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x56x56xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp123(%arg0: 
tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<4x64x56x56xf16> - return %2 : tensor<4x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp124(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown125(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @BatchNormGradOp126(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %1 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %2 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, 
tensor<64xf32>, tensor<4x64x56x56xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp127(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<4x64x56x56xf16> - return %2 : tensor<4x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp128(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown129(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x64x56x56xf16> + %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x64x56x56xf16>) { + %2 = scf.for %arg5 = %c0 to %c64 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x64x56x56xf16>) { + %3 = scf.for %arg7 = %c0 to %c56 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x64x56x56xf16>) { + %4 = scf.for %arg9 = %c0 to %c56 
step %c1 iter_args(%arg10 = %arg8) -> (tensor<4x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: i1, %out: f16): + %7 = arith.addf %in, %in_2 : f16 + %8 = arith.select %in_3, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + scf.yield %inserted_slice : tensor<4x64x56x56xf16> + } + scf.yield %4 : tensor<4x64x56x56xf16> + } + scf.yield %3 : tensor<4x64x56x56xf16> + } + scf.yield %2 : tensor<4x64x56x56xf16> + } return %1 : tensor<4x64x56x56xf16> } - func.func private @BatchNormGradOp130(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + func.func private @BatchNormGradOp128(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> %1 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> %2 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> @@ -1085,66 +1849,110 @@ module @IrToMhlo.2452 { %3 = mhlo.convert %grad_operand : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> return %3, %grad_scale, %grad_offset : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> } - func.func private @ConvBackwardDataOp131(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp129(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> %2 = 
mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<4x64x56x56xf16> return %2 : tensor<4x64x56x56xf16> } - func.func private @ConvBackwardFilterOp132(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp130(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<3x3x64x64xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> return %1 : tensor<64x64x3x3xf16> } - func.func private @Unknown133(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown131(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x64x56x56xf16>) { + %4 = scf.for %arg8 = %c0 to %c56 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, 
#map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + scf.yield %inserted_slice : tensor<4x64x56x56xf16> + } + scf.yield %4 : tensor<4x64x56x56xf16> + } + scf.yield %3 : tensor<4x64x56x56xf16> + } + scf.yield %2 : tensor<4x64x56x56xf16> + } return %1 : tensor<4x64x56x56xf16> } - func.func private @BatchNormGradOp134(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %1 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %2 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x56x56xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp135(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<4x64x56x56xf16> - return %2 : tensor<4x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp136(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : 
(tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown137(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown143(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - linalg.yield %2 : f16 - } -> tensor<4x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x64x56x56xf16>) { + %4 = scf.for %arg8 = %c0 to %c56 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %7 = arith.addf %in, %in_1 : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + scf.yield %inserted_slice : tensor<4x64x56x56xf16> + } + scf.yield %4 : tensor<4x64x56x56xf16> + } + scf.yield %3 : tensor<4x64x56x56xf16> + } + scf.yield %2 : tensor<4x64x56x56xf16> + } return %1 : tensor<4x64x56x56xf16> } - func.func private @Unknown138(%arg0: tensor<4x64x112x112xi1>, %arg1: tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown144(%arg0: tensor<4x64x112x112xi1>, %arg1: tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x112x112xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16>) outs(%0 : tensor<4x64x112x112xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x64x112x112xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 
iter_args(%arg3 = %0) -> (tensor<4x64x112x112xf16>) { + %2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x64x112x112xf16>) { + %3 = scf.for %arg6 = %c0 to %c112 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x64x112x112xf16>) { + %4 = scf.for %arg8 = %c0 to %c112 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x64x112x112xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x112x112xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x112x112xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x112x112xf16> + scf.yield %inserted_slice : tensor<4x64x112x112xf16> + } + scf.yield %4 : tensor<4x64x112x112xf16> + } + scf.yield %3 : tensor<4x64x112x112xf16> + } + scf.yield %2 : tensor<4x64x112x112xf16> + } return %1 : tensor<4x64x112x112xf16> } - func.func private @BatchNormGradOp139(%arg0: tensor<4x64x112x112xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + func.func private @BatchNormGradOp145(%arg0: tensor<4x64x112x112xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> %1 = mhlo.convert %arg0 : (tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf32> %2 = mhlo.convert %arg2 : (tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf32> @@ -1152,15 +1960,246 @@ module @IrToMhlo.2452 { %3 = mhlo.convert %grad_operand : (tensor<4x64x112x112xf32>) -> tensor<4x64x112x112xf16> return %3, %grad_scale, %grad_offset : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32> } - func.func private @ConvBackwardFilterOp140(%arg0: tensor<4x3x224x224xf16>, %arg1: tensor<4x64x112x112xf16>) -> tensor<64x3x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<3> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp146(%arg0: tensor<4x3x224x224xf16>, %arg1: tensor<4x64x112x112xf16>) -> tensor<64x3x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<3> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[3, 2], [3, 2]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, 
feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x3x224x224xf16>, tensor<4x64x112x112xf16>) -> tensor<7x7x3x64xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,3,7,7]{0,1,3,2}"} : (tensor<7x7x3x64xf16>) -> tensor<64x3x7x7xf16> return %1 : tensor<64x3x7x7xf16> } - func.func private @Unknown141(%arg0: tensor) -> tensor attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown147(%arg0: tensor<4x1000xf16>, %arg1: tensor<4x1000xf32>) -> tensor attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %cst_0 = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor + %collapsed = tensor.collapse_shape %arg0 [[0, 1]] : tensor<4x1000xf16> into tensor<4000xf16> + %collapsed_1 = tensor.collapse_shape %arg1 [[0, 1]] : tensor<4x1000xf32> into tensor<4000xf32> + %expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<4000xf16> into tensor<32x125xf16> + %expanded_2 = tensor.expand_shape %collapsed_1 [[0, 1]] : tensor<4000xf32> into tensor<32x125xf32> + %1 = tensor.empty() : tensor<32xf32> + %2 = scf.forall (%arg2) in (32) shared_outs(%arg3 = %1) -> (tensor<32xf32>) { + %extracted_slice = tensor.extract_slice %expanded[%arg2, 0] [1, 125] [1, 1] : tensor<32x125xf16> to tensor<125xf16> + %expanded_3 = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<125xf16> into tensor<1x125xf16> + %extracted_slice_4 = tensor.extract_slice %expanded_2[%arg2, 0] [1, 125] [1, 1] : tensor<32x125xf32> to tensor<125xf32> + %expanded_5 = tensor.expand_shape %extracted_slice_4 [[0, 1]] : tensor<125xf32> into tensor<1x125xf32> + %extracted_slice_6 = tensor.extract_slice %arg3[%arg2] [1] [1] : tensor<32xf32> to tensor + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<128xf32> + %5 = scf.forall (%arg4) in (128) shared_outs(%arg5 = %4) -> (tensor<128xf32>) { + %19 = affine.min #map8(%arg4) + %20 = affine.min #map9(%arg4) + %21 = affine.apply #map3(%20, %19) + %extracted_slice_13 = tensor.extract_slice %expanded_3[0, %19] [1, %21] [1, 1] : tensor<1x125xf16> to tensor + %expanded_14 = tensor.expand_shape %extracted_slice_13 [[0, 1]] : tensor into tensor<1x?xf16> + %extracted_slice_15 = tensor.extract_slice %expanded_5[0, %19] [1, %21] [1, 1] : tensor<1x125xf32> to tensor + %expanded_16 = tensor.expand_shape %extracted_slice_15 [[0, 1]] : tensor into tensor<1x?xf32> + %dim = tensor.dim %expanded_14, %c1 : tensor<1x?xf16> + %22 = arith.cmpi ugt, %dim, %c0 : index + %23 = scf.if %22 -> (f16) { + %extracted = tensor.extract %expanded_14[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %dim_17 = tensor.dim %expanded_16, %c1 : tensor<1x?xf32> + %24 = arith.cmpi ugt, %dim_17, %c0 : index + %25 = scf.if %24 -> (f32) { + %extracted = tensor.extract %expanded_16[%c0, %c0] : tensor<1x?xf32> + scf.yield %extracted : f32 + } else { + scf.yield %cst_0 : f32 + } + %26 = arith.extf %23 : f16 to f32 + %27 = arith.mulf %26, %25 : f32 + %28 = arith.addf %27, %cst_0 : f32 + %extracted_slice_18 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<128xf32> to tensor + %inserted = tensor.insert %28 into %extracted_slice_18[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<128xf32> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %5 [[0, 1]] : tensor<128xf32> into 
tensor<64x2xf32> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf32> + %7 = scf.forall (%arg4) in (64) shared_outs(%arg5 = %6) -> (tensor<64xf32>) { + %extracted = tensor.extract %expanded_7[%arg4, %c0] : tensor<64x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_7[%arg4, %c1] : tensor<64x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<64xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<64xf32> + } + } {mapping = [#gpu.thread]} + %expanded_8 = tensor.expand_shape %7 [[0, 1]] : tensor<64xf32> into tensor<32x2xf32> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf32> + %9 = scf.forall (%arg4) in (32) shared_outs(%arg5 = %8) -> (tensor<32xf32>) { + %extracted = tensor.extract %expanded_8[%arg4, %c0] : tensor<32x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_8[%arg4, %c1] : tensor<32x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<32xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<32xf32> + } + } {mapping = [#gpu.thread]} + %expanded_9 = tensor.expand_shape %9 [[0, 1]] : tensor<32xf32> into tensor<16x2xf32> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf32> + %11 = scf.forall (%arg4) in (16) shared_outs(%arg5 = %10) -> (tensor<16xf32>) { + %extracted = tensor.extract %expanded_9[%arg4, %c0] : tensor<16x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_9[%arg4, %c1] : tensor<16x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<16xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<16xf32> + } + } {mapping = [#gpu.thread]} + %expanded_10 = tensor.expand_shape %11 [[0, 1]] : tensor<16xf32> into tensor<8x2xf32> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf32> + %13 = scf.forall (%arg4) in (8) shared_outs(%arg5 = %12) -> (tensor<8xf32>) { + %extracted = tensor.extract %expanded_10[%arg4, %c0] : tensor<8x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_10[%arg4, %c1] : tensor<8x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<8xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<8xf32> + } + } {mapping = [#gpu.thread]} + %expanded_11 = tensor.expand_shape %13 [[0, 1]] : tensor<8xf32> into tensor<4x2xf32> + %14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf32> + %15 = scf.forall (%arg4) in (4) shared_outs(%arg5 = %14) -> (tensor<4xf32>) { + %extracted = tensor.extract %expanded_11[%arg4, %c0] : tensor<4x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + 
%extracted_13 = tensor.extract %expanded_11[%arg4, %c1] : tensor<4x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<4xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<4xf32> + } + } {mapping = [#gpu.thread]} + %expanded_12 = tensor.expand_shape %15 [[0, 1]] : tensor<4xf32> into tensor<2x2xf32> + %16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf32> + %17 = scf.forall (%arg4) in (2) shared_outs(%arg5 = %16) -> (tensor<2xf32>) { + %extracted = tensor.extract %expanded_12[%arg4, %c0] : tensor<2x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_12[%arg4, %c1] : tensor<2x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<2xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<2xf32> + } + } {mapping = [#gpu.thread]} + %18 = scf.forall (%arg4) in (1) shared_outs(%arg5 = %extracted_slice_6) -> (tensor) { + %19 = affine.apply #map4(%arg4) + %extracted = tensor.extract %17[%19] : tensor<2xf32> + %20 = arith.addf %extracted, %cst_0 : f32 + %21 = affine.apply #map5(%arg4) + %extracted_13 = tensor.extract %17[%21] : tensor<2xf32> + %22 = arith.addf %extracted_13, %20 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[] [] [] : tensor to tensor + %inserted = tensor.insert %22 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %18 into %arg3[%arg2] [1] [1] : tensor into tensor<32xf32> + } + } {mapping = [#gpu.block]} + %3 = scf.forall (%arg2) in (1) shared_outs(%arg3 = %0) -> (tensor) { + %4 = affine.apply #map10(%arg2) + %extracted_slice = tensor.extract_slice %2[%4] [32] [1] : tensor<32xf32> to tensor<32xf32> + %expanded_3 = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<32xf32> into tensor<32x1xf32> + %5 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf32> + %6 = scf.forall (%arg4) in (32) shared_outs(%arg5 = %5) -> (tensor<32xf32>) { + %extracted = tensor.extract %expanded_3[%arg4, %c0] : tensor<32x1xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_slice_8 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<32xf32> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<32xf32> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %6 [[0, 1]] : tensor<32xf32> into tensor<16x2xf32> + %7 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf32> + %8 = scf.forall (%arg4) in (16) shared_outs(%arg5 = %7) -> (tensor<16xf32>) { + %extracted = tensor.extract %expanded_4[%arg4, %c0] : tensor<16x2xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_8 = tensor.extract %expanded_4[%arg4, %c1] : tensor<16x2xf32> + %17 = arith.addf %extracted_8, %16 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<16xf32> to tensor + %inserted = tensor.insert %17 
into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<16xf32> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %8 [[0, 1]] : tensor<16xf32> into tensor<8x2xf32> + %9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf32> + %10 = scf.forall (%arg4) in (8) shared_outs(%arg5 = %9) -> (tensor<8xf32>) { + %extracted = tensor.extract %expanded_5[%arg4, %c0] : tensor<8x2xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_8 = tensor.extract %expanded_5[%arg4, %c1] : tensor<8x2xf32> + %17 = arith.addf %extracted_8, %16 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<8xf32> to tensor + %inserted = tensor.insert %17 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<8xf32> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %10 [[0, 1]] : tensor<8xf32> into tensor<4x2xf32> + %11 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf32> + %12 = scf.forall (%arg4) in (4) shared_outs(%arg5 = %11) -> (tensor<4xf32>) { + %extracted = tensor.extract %expanded_6[%arg4, %c0] : tensor<4x2xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_8 = tensor.extract %expanded_6[%arg4, %c1] : tensor<4x2xf32> + %17 = arith.addf %extracted_8, %16 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<4xf32> to tensor + %inserted = tensor.insert %17 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<4xf32> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %12 [[0, 1]] : tensor<4xf32> into tensor<2x2xf32> + %13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf32> + %14 = scf.forall (%arg4) in (2) shared_outs(%arg5 = %13) -> (tensor<2xf32>) { + %extracted = tensor.extract %expanded_7[%arg4, %c0] : tensor<2x2xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_8 = tensor.extract %expanded_7[%arg4, %c1] : tensor<2x2xf32> + %17 = arith.addf %extracted_8, %16 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<2xf32> to tensor + %inserted = tensor.insert %17 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<2xf32> + } + } {mapping = [#gpu.thread]} + %15 = scf.forall (%arg4) in (1) shared_outs(%arg5 = %arg3) -> (tensor) { + %16 = affine.apply #map4(%arg4) + %extracted = tensor.extract %14[%16] : tensor<2xf32> + %17 = arith.addf %extracted, %cst_0 : f32 + %18 = affine.apply #map5(%arg4) + %extracted_8 = tensor.extract %14[%18] : tensor<2xf32> + %19 = arith.addf %extracted_8, %17 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[] [] [] : tensor to tensor + %inserted = tensor.insert %19 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %15 into %arg3[] [] [] : tensor into tensor + } + } {mapping = [#gpu.block]} + return %3 : tensor + } + func.func private @Unknown148(%arg0: tensor) -> tensor attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 4.000000e+00 : f32 %0 = tensor.empty() : 
tensor - %1 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = []} ins(%arg0 : tensor) outs(%0 : tensor) { + %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%arg0 : tensor) outs(%0 : tensor) { ^bb0(%in: f32, %out: f32): %2 = arith.negf %in : f32 %3 = arith.divf %2, %cst : f32 @@ -1168,445 +2207,619 @@ module @IrToMhlo.2452 { } -> tensor return %1 : tensor } - func.func private @Unknown142(%arg0: tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown149(%arg0: tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x3x7x7xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x3x7x7xf16>) outs(%0 : tensor<64x3x7x7xf32>) attrs = {xla_shape = "f32[64,3,7,7]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x3x7x7xf32> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x3x7x7xf32>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x3x7x7xf32>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x3x7x7xf32>) { + %4 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x3x7x7xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x3x7x7xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x3x7x7xf32> + scf.yield %inserted_slice : tensor<64x3x7x7xf32> + } + scf.yield %4 : tensor<64x3x7x7xf32> + } + scf.yield %3 : tensor<64x3x7x7xf32> + } + scf.yield %2 : tensor<64x3x7x7xf32> + } return %1 : tensor<64x3x7x7xf32> } - func.func private @Unknown143(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown144(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown145(%arg0: 
tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown146(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown150(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x64x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x64x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x64x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x64x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x64x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x64x3x3xf32> + scf.yield %inserted_slice : tensor<64x64x3x3xf32> + } + scf.yield %4 : tensor<64x64x3x3xf32> + } + scf.yield %3 : tensor<64x64x3x3xf32> + } + scf.yield %2 : tensor<64x64x3x3xf32> + } return %1 : tensor<64x64x3x3xf32> } - func.func private @Unknown147(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown154(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x3x3xf16>) outs(%0 : tensor<128x64x3x3xf32>) attrs = {xla_shape = "f32[128,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x64x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step 
%c1 iter_args(%arg6 = %arg4) -> (tensor<128x64x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x64x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x3x3xf32> + scf.yield %inserted_slice : tensor<128x64x3x3xf32> + } + scf.yield %4 : tensor<128x64x3x3xf32> + } + scf.yield %3 : tensor<128x64x3x3xf32> + } + scf.yield %2 : tensor<128x64x3x3xf32> + } return %1 : tensor<128x64x3x3xf32> } - func.func private @Unknown148(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown155(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x128x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x128x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x128x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x128x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x128x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x128x3x3xf32> + scf.yield %inserted_slice : tensor<128x128x3x3xf32> + } + scf.yield %4 : tensor<128x128x3x3xf32> + } + scf.yield %3 : tensor<128x128x3x3xf32> + } + scf.yield %2 : tensor<128x128x3x3xf32> + } return %1 : tensor<128x128x3x3xf32> } - func.func private @Unknown149(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown156(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x1x1xf16>) outs(%0 : tensor<128x64x1x1xf32>) 
attrs = {xla_shape = "f32[128,64,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x64x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x1x1xf32> + scf.yield %inserted_slice : tensor<128x64x1x1xf32> + } + scf.yield %2 : tensor<128x64x1x1xf32> + } return %1 : tensor<128x64x1x1xf32> } - func.func private @Unknown150(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> - return %1 : tensor<128x128x3x3xf32> - } - func.func private @Unknown151(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> - return %1 : tensor<128x128x3x3xf32> - } - func.func private @Unknown152(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown159(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x3x3xf16>) outs(%0 : tensor<256x128x3x3xf32>) attrs = {xla_shape = "f32[256,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x128x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x128x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x128x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 
1] : tensor<256x128x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x3x3xf32> + scf.yield %inserted_slice : tensor<256x128x3x3xf32> + } + scf.yield %4 : tensor<256x128x3x3xf32> + } + scf.yield %3 : tensor<256x128x3x3xf32> + } + scf.yield %2 : tensor<256x128x3x3xf32> + } return %1 : tensor<256x128x3x3xf32> } - func.func private @Unknown153(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown160(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x256x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x256x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x256x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x256x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x256x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x256x3x3xf32> + scf.yield %inserted_slice : tensor<256x256x3x3xf32> + } + scf.yield %4 : tensor<256x256x3x3xf32> + } + scf.yield %3 : tensor<256x256x3x3xf32> + } + scf.yield %2 : tensor<256x256x3x3xf32> + } return %1 : tensor<256x256x3x3xf32> } - func.func private @Unknown154(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown161(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x1x1xf16>) outs(%0 : tensor<256x128x1x1xf32>) attrs = {xla_shape = "f32[256,128,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x128x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> 
(tensor<256x128x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x1x1xf32> + scf.yield %inserted_slice : tensor<256x128x1x1xf32> + } + scf.yield %2 : tensor<256x128x1x1xf32> + } return %1 : tensor<256x128x1x1xf32> } - func.func private @Unknown155(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> - return %1 : tensor<256x256x3x3xf32> - } - func.func private @Unknown156(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> - return %1 : tensor<256x256x3x3xf32> - } - func.func private @Unknown157(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown164(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x3x3xf16>) outs(%0 : tensor<512x256x3x3xf32>) attrs = {xla_shape = "f32[512,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x256x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x256x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x256x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = 
arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x3x3xf32> + scf.yield %inserted_slice : tensor<512x256x3x3xf32> + } + scf.yield %4 : tensor<512x256x3x3xf32> + } + scf.yield %3 : tensor<512x256x3x3xf32> + } + scf.yield %2 : tensor<512x256x3x3xf32> + } return %1 : tensor<512x256x3x3xf32> } - func.func private @Unknown158(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown165(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x512x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x512x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x512x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x512x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x512x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x512x3x3xf32> + scf.yield %inserted_slice : tensor<512x512x3x3xf32> + } + scf.yield %4 : tensor<512x512x3x3xf32> + } + scf.yield %3 : tensor<512x512x3x3xf32> + } + scf.yield %2 : tensor<512x512x3x3xf32> + } return %1 : tensor<512x512x3x3xf32> } - func.func private @Unknown159(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown166(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x1x1xf16>) outs(%0 : tensor<512x256x1x1xf32>) attrs = {xla_shape = "f32[512,256,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x256x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : 
tensor<512x256x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x1x1xf32> + scf.yield %inserted_slice : tensor<512x256x1x1xf32> + } + scf.yield %2 : tensor<512x256x1x1xf32> + } return %1 : tensor<512x256x1x1xf32> } - func.func private @Unknown160(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> - return %1 : tensor<512x512x3x3xf32> - } - func.func private @Unknown161(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> - return %1 : tensor<512x512x3x3xf32> - } - func.func private @MatmulOp162(%arg0: tensor<4x512xf16>, %arg1: tensor<4x1000xf16>) -> tensor<1000x512xf16> attributes {__byre__lhs_contracting_dimension = 0 : i64, __byre__output_transpose, __byre__rhs_contracting_dimension = 0 : i64, byre_compute_name = "MatmulOp"} { + func.func private @MatmulOp169(%arg0: tensor<4x512xf16>, %arg1: tensor<4x1000xf16>) -> tensor<1000x512xf16> attributes {__byre__lhs_contracting_dimension = 0 : i64, __byre__output_transpose, __byre__rhs_contracting_dimension = 0 : i64, byre_compute_name = "MatmulOp"} { %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<4x512xf16>, tensor<4x1000xf16>) -> tensor<512x1000xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[1, 0]> : tensor<2xi64>, xla_shape = "f16[1000,512]{0,1}"} : (tensor<512x1000xf16>) -> tensor<1000x512xf16> return %1 : tensor<1000x512xf16> } - func.func private @Unknown163(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown170(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1000x512xf32> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1000x512xf16>) outs(%0 : tensor<1000x512xf32>) attrs = {xla_shape = "f32[1000,512]{0,1}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<1000x512xf32> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000x512xf32>) { + %2 = scf.for %arg3 = %c0 to 
%c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1000x512xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<1000x512xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<1000x512xf32> + scf.yield %inserted_slice : tensor<1000x512xf32> + } + scf.yield %2 : tensor<1000x512xf32> + } return %1 : tensor<1000x512xf32> } - func.func private @Unknown164(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown171(%arg0: tensor<4x1000xf16>) -> tensor<1000xf32> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %cst_0 = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<1000xf32> - %1 = linalg.generic {indexing_maps = [#map6, #map6], iterator_types = ["parallel"]} ins(%arg0 : tensor<1000xf32>) outs(%0 : tensor<1000xf32>) { - ^bb0(%in: f32, %out: f32): - %2 = arith.truncf %in : f32 to f16 - %3 = arith.extf %2 : f16 to f32 - linalg.yield %3 : f32 - } -> tensor<1000xf32> + %1 = scf.forall (%arg1) in (32) shared_outs(%arg2 = %0) -> (tensor<1000xf32>) { + %2 = affine.min #map11(%arg1) + %3 = affine.apply #map10(%arg1) + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf32> + %5 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2x32xf32> + %6 = scf.forall (%arg3, %arg4) in (2, 32) shared_outs(%arg5 = %5) -> (tensor<2x32xf32>) { + %8 = affine.min #map12(%arg4, %arg1) + %9 = affine.min #map13(%arg4, %arg1) + %10 = affine.apply #map3(%9, %8) + %11 = arith.cmpi ugt, %10, %c0 : index + %12 = scf.if %11 -> (f16) { + %19 = affine.apply #map4(%arg3) + %20 = affine.apply #map14(%arg1)[%8] + %extracted = tensor.extract %arg0[%19, %20] : tensor<4x1000xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %13 = arith.extf %12 : f16 to f32 + %14 = arith.addf %13, %cst_0 : f32 + %15 = arith.cmpi ugt, %10, %c0 : index + %16 = scf.if %15 -> (f16) { + %19 = affine.apply #map5(%arg3) + %20 = affine.apply #map14(%arg1)[%8] + %extracted = tensor.extract %arg0[%19, %20] : tensor<4x1000xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %17 = arith.extf %16 : f16 to f32 + %18 = arith.addf %14, %17 : f32 + %extracted_slice_1 = tensor.extract_slice %arg5[%arg3, %arg4] [1, 1] [1, 1] : tensor<2x32xf32> to tensor + %inserted = tensor.insert %18 into %extracted_slice_1[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg3, %arg4] [1, 1] [1, 1] : tensor into tensor<2x32xf32> + } + } {mapping = [#gpu.thread, #gpu.thread]} + %7 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %4) -> (tensor<32xf32>) { + %extracted = tensor.extract %6[%c0, %arg3] : tensor<2x32xf32> + %8 = arith.addf %extracted, %cst_0 : f32 + %extracted_1 = tensor.extract %6[%c1, %arg3] : tensor<2x32xf32> + %9 = arith.addf %extracted_1, %8 : f32 + %extracted_slice_2 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf32> to tensor + %inserted = tensor.insert %9 into %extracted_slice_2[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : 
tensor into tensor<32xf32> + } + } {mapping = [#gpu.thread]} + %extracted_slice = tensor.extract_slice %7[0] [%2] [1] : tensor<32xf32> to tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %extracted_slice into %arg2[%3] [%2] [1] : tensor into tensor<1000xf32> + } + } {mapping = [#gpu.block]} return %1 : tensor<1000xf32> } - func.func @main(%arg0: tensor<4x3x224x224xf32>, %arg1: tensor<4x1000xf32>, %arg2: tensor<64x3x7x7xf32>, %arg3: tensor<64xf32>, %arg4: tensor<64xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64x64x3x3xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64xf32>, %arg10: tensor<64xf32>, %arg11: tensor<64xf32>, %arg12: tensor<64x64x3x3xf32>, %arg13: tensor<64xf32>, %arg14: tensor<64xf32>, %arg15: tensor<64xf32>, %arg16: tensor<64xf32>, %arg17: tensor<64x64x3x3xf32>, %arg18: tensor<64xf32>, %arg19: tensor<64xf32>, %arg20: tensor<64xf32>, %arg21: tensor<64xf32>, %arg22: tensor<64x64x3x3xf32>, %arg23: tensor<64xf32>, %arg24: tensor<64xf32>, %arg25: tensor<64xf32>, %arg26: tensor<64xf32>, %arg27: tensor<128x64x3x3xf32>, %arg28: tensor<128xf32>, %arg29: tensor<128xf32>, %arg30: tensor<128xf32>, %arg31: tensor<128xf32>, %arg32: tensor<128x128x3x3xf32>, %arg33: tensor<128xf32>, %arg34: tensor<128xf32>, %arg35: tensor<128xf32>, %arg36: tensor<128xf32>, %arg37: tensor<128x64x1x1xf32>, %arg38: tensor<128xf32>, %arg39: tensor<128xf32>, %arg40: tensor<128xf32>, %arg41: tensor<128xf32>, %arg42: tensor<128x128x3x3xf32>, %arg43: tensor<128xf32>, %arg44: tensor<128xf32>, %arg45: tensor<128xf32>, %arg46: tensor<128xf32>, %arg47: tensor<128x128x3x3xf32>, %arg48: tensor<128xf32>, %arg49: tensor<128xf32>, %arg50: tensor<128xf32>, %arg51: tensor<128xf32>, %arg52: tensor<256x128x3x3xf32>, %arg53: tensor<256xf32>, %arg54: tensor<256xf32>, %arg55: tensor<256xf32>, %arg56: tensor<256xf32>, %arg57: tensor<256x256x3x3xf32>, %arg58: tensor<256xf32>, %arg59: tensor<256xf32>, %arg60: tensor<256xf32>, %arg61: tensor<256xf32>, %arg62: tensor<256x128x1x1xf32>, %arg63: tensor<256xf32>, %arg64: tensor<256xf32>, %arg65: tensor<256xf32>, %arg66: tensor<256xf32>, %arg67: tensor<256x256x3x3xf32>, %arg68: tensor<256xf32>, %arg69: tensor<256xf32>, %arg70: tensor<256xf32>, %arg71: tensor<256xf32>, %arg72: tensor<256x256x3x3xf32>, %arg73: tensor<256xf32>, %arg74: tensor<256xf32>, %arg75: tensor<256xf32>, %arg76: tensor<256xf32>, %arg77: tensor<512x256x3x3xf32>, %arg78: tensor<512xf32>, %arg79: tensor<512xf32>, %arg80: tensor<512xf32>, %arg81: tensor<512xf32>, %arg82: tensor<512x512x3x3xf32>, %arg83: tensor<512xf32>, %arg84: tensor<512xf32>, %arg85: tensor<512xf32>, %arg86: tensor<512xf32>, %arg87: tensor<512x256x1x1xf32>, %arg88: tensor<512xf32>, %arg89: tensor<512xf32>, %arg90: tensor<512xf32>, %arg91: tensor<512xf32>, %arg92: tensor<512x512x3x3xf32>, %arg93: tensor<512xf32>, %arg94: tensor<512xf32>, %arg95: tensor<512xf32>, %arg96: tensor<512xf32>, %arg97: tensor<512x512x3x3xf32>, %arg98: tensor<512xf32>, %arg99: tensor<512xf32>, %arg100: tensor<512xf32>, %arg101: tensor<512xf32>, %arg102: tensor<1000x512xf32>, %arg103: tensor<1000xf32>) -> (tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, 
tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32>) { - %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = mhlo.constant dense<0.000000e+00> : tensor - %2 = mhlo.constant dense<0xFC00> : tensor - %3 = call @Unknown0(%arg0) : (tensor<4x3x224x224xf32>) -> tensor<4x3x224x224xf16> - %4 = call @Unknown1(%arg2) : (tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> - %5 = mhlo.convolution(%3, %4) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x3x224x224xf16>, tensor<64x3x7x7xf16>) -> tensor<4x64x112x112xf16> - %6 = call @BatchNormTrainingOp2(%5, %arg3, %arg4) : (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x112x112xf16> - %7 = call @Unknown3(%arg7) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %8 = call @Unknown4(%arg12) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %9 = call @Unknown5(%arg17) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %10 = call @Unknown6(%arg22) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %11 = call @Unknown7(%arg37) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> - %12 = call @Unknown8(%arg27) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> - %13 = call @Unknown9(%arg32) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %14 = call @Unknown10(%arg42) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %15 = call @Unknown11(%arg47) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %16 = call @Unknown12(%arg62) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> - %17 = call @Unknown13(%arg52) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> - %18 = call @Unknown14(%arg57) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %19 = call @Unknown15(%arg67) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %20 = call @Unknown16(%arg72) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %21 = call @Unknown17(%arg87) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> - %22 = call @Unknown18(%arg77) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> - %23 = call @Unknown19(%arg82) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %24 = call @Unknown20(%arg92) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %25 = call @Unknown21(%arg97) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %26 = call @Unknown22(%arg1) : (tensor<4x1000xf32>) -> tensor<4x1000xf16> - %27 = call @Unknown23(%arg102) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> - %28 = mhlo.reduce(%26 init: %1) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - 
mhlo.return %198 : tensor + func.func private @Unknown172(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<1000xf32> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1] [1] [1] : tensor<1000xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %out: f32): + %4 = arith.truncf %in : f32 to f16 + %5 = arith.extf %4 : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[%arg1] [1] [1] : tensor into tensor<1000xf32> + scf.yield %inserted_slice : tensor<1000xf32> } - %29:2 = call @Unknown24(%6) : (tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) - %30 = "mhlo.reduce_window"(%29#0, %2) ({ + return %1 : tensor<1000xf32> + } + func.func @main(%arg0: tensor<4x3x224x224xf32>, %arg1: tensor<4x1000xf32>, %arg2: tensor<64x3x7x7xf32>, %arg3: tensor<64xf32>, %arg4: tensor<64xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64x64x3x3xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64xf32>, %arg10: tensor<64xf32>, %arg11: tensor<64xf32>, %arg12: tensor<64x64x3x3xf32>, %arg13: tensor<64xf32>, %arg14: tensor<64xf32>, %arg15: tensor<64xf32>, %arg16: tensor<64xf32>, %arg17: tensor<64x64x3x3xf32>, %arg18: tensor<64xf32>, %arg19: tensor<64xf32>, %arg20: tensor<64xf32>, %arg21: tensor<64xf32>, %arg22: tensor<64x64x3x3xf32>, %arg23: tensor<64xf32>, %arg24: tensor<64xf32>, %arg25: tensor<64xf32>, %arg26: tensor<64xf32>, %arg27: tensor<128x64x3x3xf32>, %arg28: tensor<128xf32>, %arg29: tensor<128xf32>, %arg30: tensor<128xf32>, %arg31: tensor<128xf32>, %arg32: tensor<128x128x3x3xf32>, %arg33: tensor<128xf32>, %arg34: tensor<128xf32>, %arg35: tensor<128xf32>, %arg36: tensor<128xf32>, %arg37: tensor<128x64x1x1xf32>, %arg38: tensor<128xf32>, %arg39: tensor<128xf32>, %arg40: tensor<128xf32>, %arg41: tensor<128xf32>, %arg42: tensor<128x128x3x3xf32>, %arg43: tensor<128xf32>, %arg44: tensor<128xf32>, %arg45: tensor<128xf32>, %arg46: tensor<128xf32>, %arg47: tensor<128x128x3x3xf32>, %arg48: tensor<128xf32>, %arg49: tensor<128xf32>, %arg50: tensor<128xf32>, %arg51: tensor<128xf32>, %arg52: tensor<256x128x3x3xf32>, %arg53: tensor<256xf32>, %arg54: tensor<256xf32>, %arg55: tensor<256xf32>, %arg56: tensor<256xf32>, %arg57: tensor<256x256x3x3xf32>, %arg58: tensor<256xf32>, %arg59: tensor<256xf32>, %arg60: tensor<256xf32>, %arg61: tensor<256xf32>, %arg62: tensor<256x128x1x1xf32>, %arg63: tensor<256xf32>, %arg64: tensor<256xf32>, %arg65: tensor<256xf32>, %arg66: tensor<256xf32>, %arg67: tensor<256x256x3x3xf32>, %arg68: tensor<256xf32>, %arg69: tensor<256xf32>, %arg70: tensor<256xf32>, %arg71: tensor<256xf32>, %arg72: tensor<256x256x3x3xf32>, %arg73: tensor<256xf32>, %arg74: tensor<256xf32>, %arg75: tensor<256xf32>, %arg76: tensor<256xf32>, %arg77: tensor<512x256x3x3xf32>, %arg78: tensor<512xf32>, %arg79: tensor<512xf32>, %arg80: tensor<512xf32>, %arg81: tensor<512xf32>, %arg82: tensor<512x512x3x3xf32>, %arg83: tensor<512xf32>, %arg84: tensor<512xf32>, %arg85: tensor<512xf32>, %arg86: tensor<512xf32>, %arg87: tensor<512x256x1x1xf32>, %arg88: tensor<512xf32>, %arg89: tensor<512xf32>, %arg90: tensor<512xf32>, %arg91: tensor<512xf32>, %arg92: 
tensor<512x512x3x3xf32>, %arg93: tensor<512xf32>, %arg94: tensor<512xf32>, %arg95: tensor<512xf32>, %arg96: tensor<512xf32>, %arg97: tensor<512x512x3x3xf32>, %arg98: tensor<512xf32>, %arg99: tensor<512xf32>, %arg100: tensor<512xf32>, %arg101: tensor<512xf32>, %arg102: tensor<1000x512xf32>, %arg103: tensor<1000xf32>) -> (tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32>) { + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = mhlo.constant dense<0xFC00> : tensor + %2 = call @Unknown0(%arg0) : (tensor<4x3x224x224xf32>) -> tensor<4x3x224x224xf16> + %3 = call @Unknown1(%arg2) : (tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> + %4 = mhlo.convolution(%2, %3) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x3x224x224xf16>, tensor<64x3x7x7xf16>) -> tensor<4x64x112x112xf16> + %5 = call @BatchNormTrainingOp2(%4, %arg3, %arg4) : (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x112x112xf16> + %6 = call @Unknown3(%arg7) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %7 = call @Unknown3(%arg12) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %8 = call @Unknown3(%arg17) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %9 = call @Unknown3(%arg22) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %10 = call @Unknown7(%arg37) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> + %11 = call @Unknown8(%arg27) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> + %12 = call @Unknown9(%arg32) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %13 = call @Unknown9(%arg42) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %14 = call @Unknown9(%arg47) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %15 = call @Unknown12(%arg62) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> + %16 = call @Unknown13(%arg52) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> + %17 = call @Unknown14(%arg57) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %18 = call @Unknown14(%arg67) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %19 = call @Unknown14(%arg72) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %20 = call @Unknown17(%arg87) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> + 
%21 = call @Unknown18(%arg77) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> + %22 = call @Unknown19(%arg82) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %23 = call @Unknown19(%arg92) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %24 = call @Unknown19(%arg97) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %25 = call @Unknown22(%arg1) : (tensor<4x1000xf32>) -> tensor<4x1000xf16> + %26 = call @Unknown23(%arg102) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> + %27 = call @Unknown24(%arg103) : (tensor<1000xf32>) -> tensor<1000xf16> + %28 = call @Unknown25(%25) : (tensor<4x1000xf16>) -> tensor<4xf16> + %29:2 = call @Unknown26(%5) : (tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) + %30 = "mhlo.reduce_window"(%29#0, %1) ({ ^bb0(%arg104: tensor, %arg105: tensor): - %198 = mhlo.maximum %arg104, %arg105 : tensor - mhlo.return %198 : tensor + %199 = mhlo.maximum %arg104, %arg105 : tensor + mhlo.return %199 : tensor }) {base_dilations = dense<1> : tensor<4xi64>, padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : (tensor<4x64x112x112xf16>, tensor) -> tensor<4x64x56x56xf16> - %31 = mhlo.convolution(%30, %7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %32 = call @BatchNormTrainingOp25(%31, %arg8, %arg9) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> - %33:2 = call @Unknown26(%32) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - %34 = mhlo.convolution(%33#0, %8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %31 = mhlo.convolution(%30, %6) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %32 = call @BatchNormTrainingOp27(%31, %arg8, %arg9) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> + %33:2 = call @Unknown28(%32) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %34 = mhlo.convolution(%33#0, %7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> %35 = call @BatchNormTrainingOp27(%34, %arg13, %arg14) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> - %36:2 = call @Unknown28(%35, %30) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - %37 = 
mhlo.convolution(%36#0, %9) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %38 = call @BatchNormTrainingOp29(%37, %arg18, %arg19) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> - %39:2 = call @Unknown30(%38) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - %40 = mhlo.convolution(%39#0, %10) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %41 = call @BatchNormTrainingOp31(%40, %arg23, %arg24) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> - %42:2 = call @Unknown32(%41, %36#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - %43 = mhlo.convolution(%42#0, %11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<128x64x1x1xf16>) -> tensor<4x128x28x28xf16> - %44 = call @BatchNormTrainingOp33(%43, %arg38, %arg39) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %45 = mhlo.convolution(%42#0, %12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<128x64x3x3xf16>) -> tensor<4x128x28x28xf16> - %46 = call @BatchNormTrainingOp34(%45, %arg28, %arg29) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %47:2 = call @Unknown35(%46) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - %48 = mhlo.convolution(%47#0, %13) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %49 = call @BatchNormTrainingOp36(%48, %arg33, %arg34) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %50:2 = call @Unknown37(%49, %44) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - %51 = mhlo.convolution(%50#0, %14) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %52 = call @BatchNormTrainingOp38(%51, %arg43, %arg44) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %53:2 = call @Unknown39(%52) : 
(tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - %54 = mhlo.convolution(%53#0, %15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %55 = call @BatchNormTrainingOp40(%54, %arg48, %arg49) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %56:2 = call @Unknown41(%55, %50#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - %57 = mhlo.convolution(%56#0, %16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<256x128x1x1xf16>) -> tensor<4x256x14x14xf16> - %58 = call @BatchNormTrainingOp42(%57, %arg63, %arg64) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %59 = mhlo.convolution(%56#0, %17) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<256x128x3x3xf16>) -> tensor<4x256x14x14xf16> - %60 = call @BatchNormTrainingOp43(%59, %arg53, %arg54) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %61:2 = call @Unknown44(%60) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - %62 = mhlo.convolution(%61#0, %18) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %63 = call @BatchNormTrainingOp45(%62, %arg58, %arg59) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %64:2 = call @Unknown46(%63, %58) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - %65 = mhlo.convolution(%64#0, %19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %66 = call @BatchNormTrainingOp47(%65, %arg68, %arg69) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %67:2 = call @Unknown48(%66) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - %68 = mhlo.convolution(%67#0, %20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %69 = call @BatchNormTrainingOp49(%68, %arg73, %arg74) : 
(tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %70:2 = call @Unknown50(%69, %64#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - %71 = mhlo.convolution(%70#0, %21) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<512x256x1x1xf16>) -> tensor<4x512x7x7xf16> - %72 = call @BatchNormTrainingOp51(%71, %arg88, %arg89) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %73 = mhlo.convolution(%70#0, %22) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<512x256x3x3xf16>) -> tensor<4x512x7x7xf16> - %74 = call @BatchNormTrainingOp52(%73, %arg78, %arg79) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %75:2 = call @Unknown53(%74) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %76 = mhlo.convolution(%75#0, %23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %77 = call @BatchNormTrainingOp54(%76, %arg83, %arg84) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %78:2 = call @Unknown55(%77, %72) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %79 = mhlo.convolution(%78#0, %24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %80 = call @BatchNormTrainingOp56(%79, %arg93, %arg94) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %81:2 = call @Unknown57(%80) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %82 = mhlo.convolution(%81#0, %25) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %83 = call @BatchNormTrainingOp58(%82, %arg98, %arg99) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %84:2 = call @Unknown59(%83, %78#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %85 = mhlo.reduce(%84#0 init: %1) across dimensions = [3, 2] : (tensor<4x512x7x7xf16>, tensor) -> tensor<4x512xf16> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %86 = call @Unknown60(%85) : (tensor<4x512xf16>) -> tensor<4x512xf16> - %87 = 
"mhlo.dot_general"(%86, %27) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<4x512xf16>, tensor<1000x512xf16>) -> tensor<4x1000xf16> - %88 = call @Unknown61(%arg103, %87) : (tensor<1000xf32>, tensor<4x1000xf16>) -> tensor<4x1000xf16> - %89 = mhlo.reduce(%88 init: %2) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.maximum %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %90:2 = call @Unknown62(%89, %88) : (tensor<4xf16>, tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) - %91 = mhlo.reduce(%90#1 init: %1) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %92:3 = call @Unknown63(%91, %90#0, %28, %26, %arg1) : (tensor<4xf16>, tensor<4x1000xf16>, tensor<4xf16>, tensor<4x1000xf16>, tensor<4x1000xf32>) -> (tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) - %93 = "mhlo.dot"(%92#0, %27) {precision_config = [#mhlo, #mhlo]} : (tensor<4x1000xf16>, tensor<1000x512xf16>) -> tensor<4x512xf16> - %94 = call @Unknown64(%93, %84#1) : (tensor<4x512xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> - %95:3 = call @BatchNormGradOp65(%82, %arg98, %94) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %96 = call @ConvBackwardDataOp66(%95#0, %25) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %97 = call @ConvBackwardFilterOp67(%81#0, %95#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %98 = call @Unknown68(%81#1, %96) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> - %99:3 = call @BatchNormGradOp69(%79, %arg93, %98) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %100 = call @ConvBackwardDataOp70(%99#0, %24) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %101 = call @ConvBackwardFilterOp71(%78#0, %99#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %102 = call @Unknown72(%94, %100, %78#1) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> - %103:3 = call @BatchNormGradOp73(%76, %arg83, %102) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %104 = call @ConvBackwardDataOp74(%103#0, %23) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %105 = call @ConvBackwardFilterOp75(%75#0, %103#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %106 = call @Unknown76(%75#1, %104) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> - %107:3 = call @BatchNormGradOp77(%73, %arg78, %106) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %108 = call @ConvBackwardDataOp78(%107#0, %22) : (tensor<4x512x7x7xf16>, tensor<512x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %109 = call @ConvBackwardFilterOp79(%70#0, %107#0) : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<512x256x3x3xf16> - %110:3 = call @BatchNormGradOp80(%71, %arg88, %102) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, 
tensor<512xf32>) - %111 = call @ConvBackwardDataOp81(%110#0, %21) : (tensor<4x512x7x7xf16>, tensor<512x256x1x1xf16>) -> tensor<4x256x14x14xf16> - %112 = call @ConvBackwardFilterOp82(%70#0, %110#0) : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<512x256x1x1xf16> - %113 = call @Unknown83(%111, %108, %70#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> - %114:3 = call @BatchNormGradOp84(%68, %arg73, %113) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %115 = call @ConvBackwardDataOp85(%114#0, %20) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %116 = call @ConvBackwardFilterOp86(%67#0, %114#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %117 = call @Unknown87(%67#1, %115) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> - %118:3 = call @BatchNormGradOp88(%65, %arg68, %117) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %119 = call @ConvBackwardDataOp89(%118#0, %19) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %120 = call @ConvBackwardFilterOp90(%64#0, %118#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %121 = call @Unknown91(%113, %119, %64#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> - %122:3 = call @BatchNormGradOp92(%62, %arg58, %121) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %123 = call @ConvBackwardDataOp93(%122#0, %18) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %124 = call @ConvBackwardFilterOp94(%61#0, %122#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %125 = call @Unknown95(%61#1, %123) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> - %126:3 = call @BatchNormGradOp96(%59, %arg53, %125) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %127 = call @ConvBackwardDataOp97(%126#0, %17) : (tensor<4x256x14x14xf16>, tensor<256x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %128 = call @ConvBackwardFilterOp98(%56#0, %126#0) : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<256x128x3x3xf16> - %129:3 = call @BatchNormGradOp99(%57, %arg63, %121) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %130 = call @ConvBackwardDataOp100(%129#0, %16) : (tensor<4x256x14x14xf16>, tensor<256x128x1x1xf16>) -> tensor<4x128x28x28xf16> - %131 = call @ConvBackwardFilterOp101(%56#0, %129#0) : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<256x128x1x1xf16> - %132 = call @Unknown102(%130, %127, %56#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> - %133:3 = call @BatchNormGradOp103(%54, %arg48, %132) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %134 = call @ConvBackwardDataOp104(%133#0, %15) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %135 = call 
@ConvBackwardFilterOp105(%53#0, %133#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %136 = call @Unknown106(%53#1, %134) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> - %137:3 = call @BatchNormGradOp107(%51, %arg43, %136) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %138 = call @ConvBackwardDataOp108(%137#0, %14) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %139 = call @ConvBackwardFilterOp109(%50#0, %137#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %140 = call @Unknown110(%132, %138, %50#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> - %141:3 = call @BatchNormGradOp111(%48, %arg33, %140) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %142 = call @ConvBackwardDataOp112(%141#0, %13) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %143 = call @ConvBackwardFilterOp113(%47#0, %141#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %144 = call @Unknown114(%47#1, %142) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> - %145:3 = call @BatchNormGradOp115(%45, %arg28, %144) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %146 = call @ConvBackwardDataOp116(%145#0, %12) : (tensor<4x128x28x28xf16>, tensor<128x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %147 = call @ConvBackwardFilterOp117(%42#0, %145#0) : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<128x64x3x3xf16> - %148:3 = call @BatchNormGradOp118(%43, %arg38, %140) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %149 = call @ConvBackwardDataOp119(%148#0, %11) : (tensor<4x128x28x28xf16>, tensor<128x64x1x1xf16>) -> tensor<4x64x56x56xf16> - %150 = call @ConvBackwardFilterOp120(%42#0, %148#0) : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<128x64x1x1xf16> - %151 = call @Unknown121(%149, %146, %42#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> - %152:3 = call @BatchNormGradOp122(%40, %arg23, %151) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %153 = call @ConvBackwardDataOp123(%152#0, %10) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %154 = call @ConvBackwardFilterOp124(%39#0, %152#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %155 = call @Unknown125(%39#1, %153) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %156:3 = call @BatchNormGradOp126(%37, %arg18, %155) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %157 = call @ConvBackwardDataOp127(%156#0, %9) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %158 = call @ConvBackwardFilterOp128(%36#0, %156#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %159 = call @Unknown129(%151, %157, %36#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, 
tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> - %160:3 = call @BatchNormGradOp130(%34, %arg13, %159) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %161 = call @ConvBackwardDataOp131(%160#0, %8) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %162 = call @ConvBackwardFilterOp132(%33#0, %160#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %163 = call @Unknown133(%33#1, %161) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %164:3 = call @BatchNormGradOp134(%31, %arg8, %163) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %165 = call @ConvBackwardDataOp135(%164#0, %7) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %166 = call @ConvBackwardFilterOp136(%30, %164#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %167 = call @Unknown137(%159, %165) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %168 = "mhlo.select_and_scatter"(%29#0, %167, %1) ({ + %36:2 = call @Unknown30(%35, %30) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %37 = mhlo.convolution(%36#0, %8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %38 = call @BatchNormTrainingOp27(%37, %arg18, %arg19) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> + %39:2 = call @Unknown28(%38) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %40 = mhlo.convolution(%39#0, %9) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %41 = call @BatchNormTrainingOp27(%40, %arg23, %arg24) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> + %42:2 = call @Unknown30(%41, %36#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %43 = mhlo.convolution(%42#0, %10) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<128x64x1x1xf16>) -> tensor<4x128x28x28xf16> + %44 = call @BatchNormTrainingOp35(%43, %arg38, %arg39) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %45 = mhlo.convolution(%42#0, %11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<128x64x3x3xf16>) -> tensor<4x128x28x28xf16> + %46 = call @BatchNormTrainingOp35(%45, %arg28, %arg29) : (tensor<4x128x28x28xf16>, tensor<128xf32>, 
tensor<128xf32>) -> tensor<4x128x28x28xf16> + %47:2 = call @Unknown37(%46) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %48 = mhlo.convolution(%47#0, %12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %49 = call @BatchNormTrainingOp35(%48, %arg33, %arg34) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %50:2 = call @Unknown39(%49, %44) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %51 = mhlo.convolution(%50#0, %13) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %52 = call @BatchNormTrainingOp35(%51, %arg43, %arg44) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %53:2 = call @Unknown37(%52) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %54 = mhlo.convolution(%53#0, %14) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %55 = call @BatchNormTrainingOp35(%54, %arg48, %arg49) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %56:2 = call @Unknown39(%55, %50#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %57 = mhlo.convolution(%56#0, %15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<256x128x1x1xf16>) -> tensor<4x256x14x14xf16> + %58 = call @BatchNormTrainingOp44(%57, %arg63, %arg64) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %59 = mhlo.convolution(%56#0, %16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<256x128x3x3xf16>) -> tensor<4x256x14x14xf16> + %60 = call @BatchNormTrainingOp44(%59, %arg53, %arg54) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %61:2 = call @Unknown46(%60) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %62 = mhlo.convolution(%61#0, %17) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> 
tensor<4x256x14x14xf16> + %63 = call @BatchNormTrainingOp44(%62, %arg58, %arg59) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %64:2 = call @Unknown48(%63, %58) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %65 = mhlo.convolution(%64#0, %18) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %66 = call @BatchNormTrainingOp44(%65, %arg68, %arg69) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %67:2 = call @Unknown46(%66) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %68 = mhlo.convolution(%67#0, %19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %69 = call @BatchNormTrainingOp44(%68, %arg73, %arg74) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %70:2 = call @Unknown48(%69, %64#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %71 = mhlo.convolution(%70#0, %20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<512x256x1x1xf16>) -> tensor<4x512x7x7xf16> + %72 = call @BatchNormTrainingOp53(%71, %arg88, %arg89) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %73 = mhlo.convolution(%70#0, %21) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<512x256x3x3xf16>) -> tensor<4x512x7x7xf16> + %74 = call @BatchNormTrainingOp53(%73, %arg78, %arg79) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %75:2 = call @Unknown55(%74) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %76 = mhlo.convolution(%75#0, %22) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %77 = call @BatchNormTrainingOp53(%76, %arg83, %arg84) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %78:2 = call @Unknown57(%77, %72) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %79 = mhlo.convolution(%78#0, %23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, 
feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %80 = call @BatchNormTrainingOp53(%79, %arg93, %arg94) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %81:2 = call @Unknown55(%80) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %82 = mhlo.convolution(%81#0, %24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %83 = call @BatchNormTrainingOp53(%82, %arg98, %arg99) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %84:2 = call @Unknown57(%83, %78#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %85 = call @Unknown62(%84#0) : (tensor<4x512x7x7xf16>) -> tensor<4x512xf16> + %86 = call @Unknown63(%85) : (tensor<4x512xf16>) -> tensor<4x512xf16> + %87 = "mhlo.dot_general"(%86, %26) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<4x512xf16>, tensor<1000x512xf16>) -> tensor<4x1000xf16> + %88 = call @Unknown64(%27, %87) : (tensor<1000xf16>, tensor<4x1000xf16>) -> tensor<4x1000xf16> + %89 = call @Unknown65(%88) : (tensor<4x1000xf16>) -> tensor<4xf16> + %90 = call @Unknown66(%89, %88) : (tensor<4xf16>, tensor<4x1000xf16>) -> tensor<4x1000xf16> + %91 = call @Unknown67(%90) : (tensor<4x1000xf16>) -> tensor<4xf16> + %92 = call @Unknown68(%91) : (tensor<4xf16>) -> tensor<4xf16> + %93:2 = call @Unknown69(%92, %90, %28, %25) : (tensor<4xf16>, tensor<4x1000xf16>, tensor<4xf16>, tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) + %94 = "mhlo.dot"(%93#1, %26) {precision_config = [#mhlo, #mhlo]} : (tensor<4x1000xf16>, tensor<1000x512xf16>) -> tensor<4x512xf16> + %95 = call @Unknown70(%94, %84#1) : (tensor<4x512xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> + %96:3 = call @BatchNormGradOp71(%82, %arg98, %95) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %97 = call @ConvBackwardDataOp72(%96#0, %24) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %98 = call @ConvBackwardFilterOp73(%81#0, %96#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %99 = call @Unknown74(%81#1, %97) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> + %100:3 = call @BatchNormGradOp71(%79, %arg93, %99) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %101 = call @ConvBackwardDataOp72(%100#0, %23) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %102 = call @ConvBackwardFilterOp73(%78#0, %100#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %103 = call @Unknown78(%95, %101, %78#1) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> + %104:3 = call @BatchNormGradOp71(%76, %arg83, %103) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %105 = call @ConvBackwardDataOp72(%104#0, %22) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> 
tensor<4x512x7x7xf16> + %106 = call @ConvBackwardFilterOp73(%75#0, %104#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %107 = call @Unknown74(%75#1, %105) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> + %108:3 = call @BatchNormGradOp71(%73, %arg78, %107) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %109 = call @ConvBackwardDataOp84(%108#0, %21) : (tensor<4x512x7x7xf16>, tensor<512x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %110 = call @ConvBackwardFilterOp85(%70#0, %108#0) : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<512x256x3x3xf16> + %111:3 = call @BatchNormGradOp71(%71, %arg88, %103) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %112 = call @ConvBackwardDataOp87(%111#0, %20) : (tensor<4x512x7x7xf16>, tensor<512x256x1x1xf16>) -> tensor<4x256x14x14xf16> + %113 = call @ConvBackwardFilterOp88(%70#0, %111#0) : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<512x256x1x1xf16> + %114 = call @Unknown89(%112, %109, %70#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> + %115:3 = call @BatchNormGradOp90(%68, %arg73, %114) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %116 = call @ConvBackwardDataOp91(%115#0, %19) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %117 = call @ConvBackwardFilterOp92(%67#0, %115#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %118 = call @Unknown93(%67#1, %116) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> + %119:3 = call @BatchNormGradOp90(%65, %arg68, %118) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %120 = call @ConvBackwardDataOp91(%119#0, %18) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %121 = call @ConvBackwardFilterOp92(%64#0, %119#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %122 = call @Unknown89(%114, %120, %64#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> + %123:3 = call @BatchNormGradOp90(%62, %arg58, %122) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %124 = call @ConvBackwardDataOp91(%123#0, %17) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %125 = call @ConvBackwardFilterOp92(%61#0, %123#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %126 = call @Unknown93(%61#1, %124) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> + %127:3 = call @BatchNormGradOp90(%59, %arg53, %126) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %128 = call @ConvBackwardDataOp103(%127#0, %16) : (tensor<4x256x14x14xf16>, tensor<256x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %129 = call @ConvBackwardFilterOp104(%56#0, %127#0) : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<256x128x3x3xf16> + %130:3 = call @BatchNormGradOp90(%57, %arg63, %122) : (tensor<4x256x14x14xf16>, 
tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %131 = call @ConvBackwardDataOp106(%130#0, %15) : (tensor<4x256x14x14xf16>, tensor<256x128x1x1xf16>) -> tensor<4x128x28x28xf16> + %132 = call @ConvBackwardFilterOp107(%56#0, %130#0) : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<256x128x1x1xf16> + %133 = call @Unknown108(%131, %128, %56#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> + %134:3 = call @BatchNormGradOp109(%54, %arg48, %133) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %135 = call @ConvBackwardDataOp110(%134#0, %14) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %136 = call @ConvBackwardFilterOp111(%53#0, %134#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %137 = call @Unknown112(%53#1, %135) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> + %138:3 = call @BatchNormGradOp109(%51, %arg43, %137) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %139 = call @ConvBackwardDataOp110(%138#0, %13) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %140 = call @ConvBackwardFilterOp111(%50#0, %138#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %141 = call @Unknown108(%133, %139, %50#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> + %142:3 = call @BatchNormGradOp109(%48, %arg33, %141) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %143 = call @ConvBackwardDataOp110(%142#0, %12) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %144 = call @ConvBackwardFilterOp111(%47#0, %142#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %145 = call @Unknown112(%47#1, %143) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> + %146:3 = call @BatchNormGradOp109(%45, %arg28, %145) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %147 = call @ConvBackwardDataOp122(%146#0, %11) : (tensor<4x128x28x28xf16>, tensor<128x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %148 = call @ConvBackwardFilterOp123(%42#0, %146#0) : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<128x64x3x3xf16> + %149:3 = call @BatchNormGradOp109(%43, %arg38, %141) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %150 = call @ConvBackwardDataOp125(%149#0, %10) : (tensor<4x128x28x28xf16>, tensor<128x64x1x1xf16>) -> tensor<4x64x56x56xf16> + %151 = call @ConvBackwardFilterOp126(%42#0, %149#0) : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<128x64x1x1xf16> + %152 = call @Unknown127(%150, %147, %42#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> + %153:3 = call @BatchNormGradOp128(%40, %arg23, %152) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %154 = call @ConvBackwardDataOp129(%153#0, %9) : 
(tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %155 = call @ConvBackwardFilterOp130(%39#0, %153#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %156 = call @Unknown131(%39#1, %154) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %157:3 = call @BatchNormGradOp128(%37, %arg18, %156) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %158 = call @ConvBackwardDataOp129(%157#0, %8) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %159 = call @ConvBackwardFilterOp130(%36#0, %157#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %160 = call @Unknown127(%152, %158, %36#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> + %161:3 = call @BatchNormGradOp128(%34, %arg13, %160) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %162 = call @ConvBackwardDataOp129(%161#0, %7) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %163 = call @ConvBackwardFilterOp130(%33#0, %161#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %164 = call @Unknown131(%33#1, %162) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %165:3 = call @BatchNormGradOp128(%31, %arg8, %164) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %166 = call @ConvBackwardDataOp129(%165#0, %6) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %167 = call @ConvBackwardFilterOp130(%30, %165#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %168 = call @Unknown143(%160, %166) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %169 = "mhlo.select_and_scatter"(%29#0, %168, %0) ({ ^bb0(%arg104: tensor, %arg105: tensor): - %198 = mhlo.compare GE, %arg104, %arg105 : (tensor, tensor) -> tensor - mhlo.return %198 : tensor + %199 = mhlo.compare GE, %arg104, %arg105 : (tensor, tensor) -> tensor + mhlo.return %199 : tensor }, { ^bb0(%arg104: tensor, %arg105: tensor): - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor + %199 = mhlo.add %arg104, %arg105 : tensor + mhlo.return %199 : tensor }) {padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : (tensor<4x64x112x112xf16>, tensor<4x64x56x56xf16>, tensor) -> tensor<4x64x112x112xf16> - %169 = call @Unknown138(%29#1, %168) : (tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> - %170:3 = call @BatchNormGradOp139(%5, %arg3, %169) : (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) - %171 = call @ConvBackwardFilterOp140(%3, %170#0) : (tensor<4x3x224x224xf16>, tensor<4x64x112x112xf16>) -> tensor<64x3x7x7xf16> - %172 = mhlo.reduce(%92#1 init: %0) across dimensions = [0, 1] : (tensor<4x1000xf32>, tensor) -> tensor - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %173 = call @Unknown141(%172) : (tensor) -> tensor - %174 = call @Unknown142(%171) : (tensor<64x3x7x7xf16>) -> 
tensor<64x3x7x7xf32> - %175 = call @Unknown143(%166) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %176 = call @Unknown144(%162) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %177 = call @Unknown145(%158) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %178 = call @Unknown146(%154) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %179 = call @Unknown147(%147) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> - %180 = call @Unknown148(%143) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %181 = call @Unknown149(%150) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> - %182 = call @Unknown150(%139) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %183 = call @Unknown151(%135) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %184 = call @Unknown152(%128) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> - %185 = call @Unknown153(%124) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %186 = call @Unknown154(%131) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> - %187 = call @Unknown155(%120) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %188 = call @Unknown156(%116) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %189 = call @Unknown157(%109) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> - %190 = call @Unknown158(%105) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %191 = call @Unknown159(%112) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> - %192 = call @Unknown160(%101) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %193 = call @Unknown161(%97) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %194 = call @MatmulOp162(%86, %92#0) : (tensor<4x512xf16>, tensor<4x1000xf16>) -> tensor<1000x512xf16> - %195 = call @Unknown163(%194) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> - %196 = mhlo.reduce(%92#2 init: %0) across dimensions = [0] : (tensor<4x1000xf32>, tensor) -> tensor<1000xf32> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %197 = call @Unknown164(%196) : (tensor<1000xf32>) -> tensor<1000xf32> - return %173, %174, %170#1, %170#2, %175, %164#1, %164#2, %176, %160#1, %160#2, %177, %156#1, %156#2, %178, %152#1, %152#2, %179, %145#1, %145#2, %180, %141#1, %141#2, %181, %148#1, %148#2, %182, %137#1, %137#2, %183, %133#1, %133#2, %184, %126#1, %126#2, %185, %122#1, %122#2, %186, %129#1, %129#2, %187, %118#1, %118#2, %188, %114#1, %114#2, %189, %107#1, %107#2, %190, %103#1, %103#2, %191, %110#1, %110#2, %192, %99#1, %99#2, %193, %95#1, %95#2, %195, %197 : tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, 
tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32> + %170 = call @Unknown144(%29#1, %169) : (tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> + %171:3 = call @BatchNormGradOp145(%4, %arg3, %170) : (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) + %172 = call @ConvBackwardFilterOp146(%2, %171#0) : (tensor<4x3x224x224xf16>, tensor<4x64x112x112xf16>) -> tensor<64x3x7x7xf16> + %173 = call @Unknown147(%93#0, %arg1) : (tensor<4x1000xf16>, tensor<4x1000xf32>) -> tensor + %174 = call @Unknown148(%173) : (tensor) -> tensor + %175 = call @Unknown149(%172) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> + %176 = call @Unknown150(%167) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %177 = call @Unknown150(%163) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %178 = call @Unknown150(%159) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %179 = call @Unknown150(%155) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %180 = call @Unknown154(%148) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> + %181 = call @Unknown155(%144) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %182 = call @Unknown156(%151) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> + %183 = call @Unknown155(%140) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %184 = call @Unknown155(%136) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %185 = call @Unknown159(%129) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> + %186 = call @Unknown160(%125) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %187 = call @Unknown161(%132) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> + %188 = call @Unknown160(%121) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %189 = call @Unknown160(%117) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %190 = call @Unknown164(%110) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> + %191 = call @Unknown165(%106) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %192 = call @Unknown166(%113) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> + %193 = call @Unknown165(%102) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %194 = call @Unknown165(%98) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %195 = call @MatmulOp169(%86, %93#1) : (tensor<4x512xf16>, tensor<4x1000xf16>) -> tensor<1000x512xf16> + %196 = call @Unknown170(%195) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> + %197 = call @Unknown171(%93#1) : (tensor<4x1000xf16>) -> tensor<1000xf32> + %198 = call @Unknown172(%197) : (tensor<1000xf32>) -> tensor<1000xf32> + return %174, %175, %171#1, %171#2, %176, %165#1, %165#2, %177, %161#1, %161#2, %178, %157#1, %157#2, %179, %153#1, %153#2, %180, %146#1, %146#2, %181, %142#1, %142#2, %182, %149#1, %149#2, %183, %138#1, %138#2, %184, %134#1, %134#2, %185, %127#1, %127#2, %186, %123#1, %123#2, %187, %130#1, %130#2, %188, %119#1, %119#2, %189, %115#1, %115#2, %190, %108#1, %108#2, %191, %104#1, %104#2, %192, %111#1, %111#2, %193, %100#1, %100#2, %194, %96#1, %96#2, %196, %198 : tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, 
tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/4_bufferize_opt.mlir b/compiler/test/E2E/ResNet18/Whole/4_bufferize_opt.mlir index 92a4816fa..5a9ff6014 100644 --- a/compiler/test/E2E/ResNet18/Whole/4_bufferize_opt.mlir +++ b/compiler/test/E2E/ResNet18/Whole/4_bufferize_opt.mlir @@ -2,674 +2,2016 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1) -> (d0, d1)> -#map2 = affine_map<(d0, d1) -> (d1)> -#map3 = affine_map<(d0, d1) -> (d0)> -#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> -#map5 = affine_map<() -> ()> -#map6 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> +#map1 = affine_map<(d0) -> (d0 * 2 - (d0 floordiv 512) * 1024, 1000)> +#map2 = affine_map<(d0) -> (d0 * 2 - (d0 floordiv 512) * 1024 + 2, 1000)> +#map3 = affine_map<(d0, d1) -> (d0 - d1)> +#map4 = affine_map<(d0) -> (d0 * 2)> +#map5 = affine_map<(d0) -> (d0 * 2 + 1)> +#map6 = affine_map<(d0) -> (d0 mod 64, 49)> +#map7 = affine_map<(d0) -> (d0 mod 64 + 1, 49)> +#map8 = affine_map<(d0) -> (d0 mod 128, 125)> +#map9 = affine_map<(d0) -> (d0 mod 128 + 1, 125)> +#map10 = affine_map<(d0) -> (d0 * 32)> +#map11 = affine_map<(d0) -> (d0 * -32 + 1000, 32)> +#map12 = affine_map<(d0, d1) -> (d1 * -32 + 1000, 32, d0)> +#map13 = affine_map<(d0, d1) -> (d1 * -32 + 1000, 32, d0 + 1)> +#map14 = affine_map<(d0)[s0] -> (d0 * 32 + s0)> module @IrToMhlo.2452 { func.func private @Unknown0(%arg0: tensor<4x3x224x224xf32>) -> tensor<4x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { + %c224 = arith.constant 224 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<4x3x224x224xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x3x224x224xf32>) outs(%0 : tensor<4x3x224x224xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<4x3x224x224xf16> + %1 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0) -> (tensor<4x3x224x224xf16>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<4x3x224x224xf16>) { + %3 = scf.for %arg5 = %c0 to %c224 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x3x224x224xf16>) { + %4 = scf.for %arg7 = %c0 to %c224 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x3x224x224xf16>) { + %extracted_slice = 
tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x3x224x224xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x3x224x224xf16> + scf.yield %inserted_slice : tensor<4x3x224x224xf16> + } + scf.yield %4 : tensor<4x3x224x224xf16> + } + scf.yield %3 : tensor<4x3x224x224xf16> + } + scf.yield %2 : tensor<4x3x224x224xf16> + } return %1 : tensor<4x3x224x224xf16> } func.func private @Unknown1(%arg0: tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x3x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x3x7x7xf32>) outs(%0 : tensor<64x3x7x7xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x3x7x7xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x3x7x7xf16>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x3x7x7xf16>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x3x7x7xf16>) { + %4 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x3x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x3x7x7xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x3x7x7xf16> + scf.yield %inserted_slice : tensor<64x3x7x7xf16> + } + scf.yield %4 : tensor<64x3x7x7xf16> + } + scf.yield %3 : tensor<64x3x7x7xf16> + } + scf.yield %2 : tensor<64x3x7x7xf16> + } return %1 : tensor<64x3x7x7xf16> } func.func private @Unknown3(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown4(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - 
^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown5(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x64x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x64x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x64x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x64x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x64x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x64x3x3xf16> + scf.yield %inserted_slice : tensor<64x64x3x3xf16> + } + scf.yield %4 : tensor<64x64x3x3xf16> + } + scf.yield %3 : tensor<64x64x3x3xf16> + } + scf.yield %2 : tensor<64x64x3x3xf16> + } return %1 : tensor<64x64x3x3xf16> } func.func private @Unknown7(%arg0: tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x1x1xf32>) outs(%0 : tensor<128x64x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x64x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = 
tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x1x1xf16> + scf.yield %inserted_slice : tensor<128x64x1x1xf16> + } + scf.yield %2 : tensor<128x64x1x1xf16> + } return %1 : tensor<128x64x1x1xf16> } func.func private @Unknown8(%arg0: tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x3x3xf32>) outs(%0 : tensor<128x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x64x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x64x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x64x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x3x3xf16> + scf.yield %inserted_slice : tensor<128x64x3x3xf16> + } + scf.yield %4 : tensor<128x64x3x3xf16> + } + scf.yield %3 : tensor<128x64x3x3xf16> + } + scf.yield %2 : tensor<128x64x3x3xf16> + } return %1 : tensor<128x64x3x3xf16> } func.func private @Unknown9(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown10(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown11(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", 
"parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x128x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x128x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x128x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x128x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x128x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x128x3x3xf16> + scf.yield %inserted_slice : tensor<128x128x3x3xf16> + } + scf.yield %4 : tensor<128x128x3x3xf16> + } + scf.yield %3 : tensor<128x128x3x3xf16> + } + scf.yield %2 : tensor<128x128x3x3xf16> + } return %1 : tensor<128x128x3x3xf16> } func.func private @Unknown12(%arg0: tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x1x1xf32>) outs(%0 : tensor<256x128x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x128x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x1x1xf16> + scf.yield %inserted_slice : tensor<256x128x1x1xf16> + } + scf.yield %2 : tensor<256x128x1x1xf16> + } return %1 : tensor<256x128x1x1xf16> } func.func private @Unknown13(%arg0: tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x3x3xf32>) outs(%0 : tensor<256x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> 
tensor<256x128x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x128x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x128x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x3x3xf16> + scf.yield %inserted_slice : tensor<256x128x3x3xf16> + } + scf.yield %4 : tensor<256x128x3x3xf16> + } + scf.yield %3 : tensor<256x128x3x3xf16> + } + scf.yield %2 : tensor<256x128x3x3xf16> + } return %1 : tensor<256x128x3x3xf16> } func.func private @Unknown14(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown15(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown16(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x256x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x256x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x256x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x256x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x256x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} 
ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x256x3x3xf16> + scf.yield %inserted_slice : tensor<256x256x3x3xf16> + } + scf.yield %4 : tensor<256x256x3x3xf16> + } + scf.yield %3 : tensor<256x256x3x3xf16> + } + scf.yield %2 : tensor<256x256x3x3xf16> + } return %1 : tensor<256x256x3x3xf16> } func.func private @Unknown17(%arg0: tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x1x1xf32>) outs(%0 : tensor<512x256x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x256x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x1x1xf16> + scf.yield %inserted_slice : tensor<512x256x1x1xf16> + } + scf.yield %2 : tensor<512x256x1x1xf16> + } return %1 : tensor<512x256x1x1xf16> } func.func private @Unknown18(%arg0: tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x3x3xf32>) outs(%0 : tensor<512x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x256x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x256x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x256x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into 
%arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x3x3xf16> + scf.yield %inserted_slice : tensor<512x256x3x3xf16> + } + scf.yield %4 : tensor<512x256x3x3xf16> + } + scf.yield %3 : tensor<512x256x3x3xf16> + } + scf.yield %2 : tensor<512x256x3x3xf16> + } return %1 : tensor<512x256x3x3xf16> } func.func private @Unknown19(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown20(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown21(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x512x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x512x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x512x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x512x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x512x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x512x3x3xf16> + scf.yield %inserted_slice : tensor<512x512x3x3xf16> + } + scf.yield %4 : tensor<512x512x3x3xf16> + } + scf.yield %3 : tensor<512x512x3x3xf16> + } + scf.yield %2 : tensor<512x512x3x3xf16> + } return %1 : tensor<512x512x3x3xf16> } func.func private @Unknown22(%arg0: tensor<4x1000xf32>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant -2.500000e-01 : f32 %0 = tensor.empty() : tensor<4x1000xf16> 
- %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<4x1000xf32>) outs(%0 : tensor<4x1000xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.mulf %in, %cst : f32 - %3 = arith.truncf %2 : f32 to f16 - linalg.yield %3 : f16 - } -> tensor<4x1000xf16> + %1 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0) -> (tensor<4x1000xf16>) { + %2 = scf.for %arg3 = %c0 to %c1000 step %c1 iter_args(%arg4 = %arg2) -> (tensor<4x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<4x1000xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.mulf %in, %cst : f32 + %6 = arith.truncf %5 : f32 to f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + scf.yield %inserted_slice : tensor<4x1000xf16> + } + scf.yield %2 : tensor<4x1000xf16> + } return %1 : tensor<4x1000xf16> } func.func private @Unknown23(%arg0: tensor<1000x512xf32>) -> tensor<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1000x512xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1000x512xf32>) outs(%0 : tensor<1000x512xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<1000x512xf16> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000x512xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1000x512xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<1000x512xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<1000x512xf16> + scf.yield %inserted_slice : tensor<1000x512xf16> + } + scf.yield %2 : tensor<1000x512xf16> + } return %1 : tensor<1000x512xf16> } - func.func private @Unknown24(%arg0: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown24(%arg0: tensor<1000xf32>) -> tensor<1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<1000xf16> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1] [1] [1] : tensor<1000xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %out: f16): + %4 = arith.truncf %in : f32 to f16 + linalg.yield %4 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[%arg1] [1] [1] : tensor into tensor<1000xf16> + scf.yield %inserted_slice : 
tensor<1000xf16> + } + return %1 : tensor<1000xf16> + } + func.func private @Unknown25(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = tensor.empty() : tensor<4xf16> + %1 = scf.forall (%arg1) in (4) shared_outs(%arg2 = %0) -> (tensor<4xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, 0] [1, 1000] [1, 1] : tensor<4x1000xf16> to tensor<1000xf16> + %expanded = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<1000xf16> into tensor<1x1000xf16> + %extracted_slice_0 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<4xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<512xf16> + %3 = scf.forall (%arg3) in (512) shared_outs(%arg4 = %2) -> (tensor<512xf16>) { + %21 = affine.min #map1(%arg3) + %22 = affine.min #map2(%arg3) + %23 = affine.apply #map3(%22, %21) + %extracted_slice_9 = tensor.extract_slice %expanded[0, %21] [1, %23] [1, 1] : tensor<1x1000xf16> to tensor + %expanded_10 = tensor.expand_shape %extracted_slice_9 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %extracted_slice_9, %c0 : tensor + %24 = arith.cmpi ugt, %dim, %c0 : index + %25 = scf.if %24 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %26 = arith.addf %25, %cst : f16 + %dim_11 = tensor.dim %extracted_slice_9, %c0 : tensor + %27 = arith.cmpi ugt, %dim_11, %c1 : index + %28 = scf.if %27 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c1] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %29 = arith.addf %26, %28 : f16 + %extracted_slice_12 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<512xf16> to tensor + %inserted = tensor.insert %29 into %extracted_slice_12[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<512xf16> + } + } {mapping = [#gpu.thread]} + %expanded_1 = tensor.expand_shape %3 [[0, 1]] : tensor<512xf16> into tensor<256x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<256xf16> + %5 = scf.forall (%arg3) in (256) shared_outs(%arg4 = %4) -> (tensor<256xf16>) { + %extracted = tensor.extract %expanded_1[%arg3, %c0] : tensor<256x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_1[%arg3, %c1] : tensor<256x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<256xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<256xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %5 [[0, 1]] : tensor<256xf16> into tensor<128x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<128xf16> + %7 = scf.forall (%arg3) in (128) shared_outs(%arg4 = %6) -> (tensor<128xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<128x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_2[%arg3, %c1] : tensor<128x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf16> to tensor + %inserted = tensor.insert %22 into 
%extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<128xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %7 [[0, 1]] : tensor<128xf16> into tensor<64x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %9 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %8) -> (tensor<64xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<64x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_3[%arg3, %c1] : tensor<64x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %9 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %11 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %10) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<32x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_4[%arg3, %c1] : tensor<32x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %11 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %13 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %12) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<16x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_5[%arg3, %c1] : tensor<16x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %13 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %15 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %14) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<8x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_6[%arg3, %c1] : tensor<8x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %15 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %16 = bufferization.alloc_tensor() 
{memory_space = #gpu.address_space} : tensor<4xf16> + %17 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %16) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_7[%arg3, %c0] : tensor<4x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_7[%arg3, %c1] : tensor<4x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_8 = tensor.expand_shape %17 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %19 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %18) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_8[%arg3, %c0] : tensor<2x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_8[%arg3, %c1] : tensor<2x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %20 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_0) -> (tensor) { + %21 = affine.apply #map4(%arg3) + %extracted = tensor.extract %19[%21] : tensor<2xf16> + %22 = arith.addf %extracted, %cst : f16 + %23 = affine.apply #map5(%arg3) + %extracted_9 = tensor.extract %19[%23] : tensor<2xf16> + %24 = arith.addf %extracted_9, %22 : f16 + %inserted = tensor.insert %24 into %arg4[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %20 into %arg2[%arg1] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.block]} + return %1 : tensor<4xf16> + } + func.func private @Unknown26(%arg0: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x112x112xf16> %1 = tensor.empty() : tensor<4x64x112x112xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x64x112x112xf16>) outs(%0, %1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c112 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { + %5:2 = 
scf.for %arg10 = %c0 to %c112 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x112x112xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x112x112xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x112x112xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> + } + scf.yield %5#0, %5#1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> + } + scf.yield %4#0, %4#1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> + } + scf.yield %3#0, %3#1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> + } return %2#0, %2#1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> } - func.func private @Unknown26(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = tensor.empty() : tensor<4x64x56x56xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x64x56x56xf16>) outs(%0, %1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - return %2#0, %2#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> - } - func.func private @Unknown28(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = tensor.empty() : tensor<4x64x56x56xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0, %1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - return %2#0, %2#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> - } - func.func private @Unknown30(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown28(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = 
arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x56x56xf16> %1 = tensor.empty() : tensor<4x64x56x56xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x64x56x56xf16>) outs(%0, %1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c56 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c56 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %5#0, %5#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %4#0, %4#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %3#0, %3#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } return %2#0, %2#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> } - func.func private @Unknown32(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown30(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x56x56xf16> %1 = tensor.empty() : tensor<4x64x56x56xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0, %1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %2:2 = scf.for 
%arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %3:2 = scf.for %arg5 = %c0 to %c64 step %c1 iter_args(%arg6 = %arg3, %arg7 = %arg4) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %4:2 = scf.for %arg8 = %c0 to %c56 step %c1 iter_args(%arg9 = %arg6, %arg10 = %arg7) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %5:2 = scf.for %arg11 = %c0 to %c56 step %c1 iter_args(%arg12 = %arg9, %arg13 = %arg10) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16, %out_3: i1): + %9 = arith.addf %in, %in_2 : f16 + %10 = arith.maximumf %9, %cst : f16 + %11 = arith.cmpf ogt, %10, %cst : f16 + linalg.yield %10, %11 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg12[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + %inserted_slice_1 = tensor.insert_slice %8#1 into %arg13[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xi1> + scf.yield %inserted_slice, %inserted_slice_1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %5#0, %5#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %4#0, %4#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %3#0, %3#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } return %2#0, %2#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> } - func.func private @Unknown35(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = tensor.empty() : tensor<4x128x28x28xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x128x28x28xf16>) outs(%0, %1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - return %2#0, %2#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> - } - func.func private @Unknown37(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown37(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x128x28x28xf16> %1 = tensor.empty() : tensor<4x128x28x28xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", 
"parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%0, %1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c128 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c28 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c28 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %5#0, %5#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %4#0, %4#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %3#0, %3#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } return %2#0, %2#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> } - func.func private @Unknown39(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown39(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x128x28x28xf16> %1 = tensor.empty() : tensor<4x128x28x28xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x128x28x28xf16>) outs(%0, %1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %2:2 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %3:2 = scf.for %arg5 = %c0 to %c128 step %c1 iter_args(%arg6 = %arg3, %arg7 = %arg4) -> (tensor<4x128x28x28xf16>, 
tensor<4x128x28x28xi1>) { + %4:2 = scf.for %arg8 = %c0 to %c28 step %c1 iter_args(%arg9 = %arg6, %arg10 = %arg7) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %5:2 = scf.for %arg11 = %c0 to %c28 step %c1 iter_args(%arg12 = %arg9, %arg13 = %arg10) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16, %out_3: i1): + %9 = arith.addf %in, %in_2 : f16 + %10 = arith.maximumf %9, %cst : f16 + %11 = arith.cmpf ogt, %10, %cst : f16 + linalg.yield %10, %11 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg12[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xf16> + %inserted_slice_1 = tensor.insert_slice %8#1 into %arg13[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xi1> + scf.yield %inserted_slice, %inserted_slice_1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %5#0, %5#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %4#0, %4#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %3#0, %3#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } return %2#0, %2#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> } - func.func private @Unknown41(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = tensor.empty() : tensor<4x128x28x28xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%0, %1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - return %2#0, %2#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> - } - func.func private @Unknown44(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = tensor.empty() : tensor<4x256x14x14xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x256x14x14xf16>) outs(%0, %1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - return %2#0, %2#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> - } - func.func private @Unknown46(%arg0: 
tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown46(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x256x14x14xf16> %1 = tensor.empty() : tensor<4x256x14x14xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%0, %1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c256 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c14 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c14 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %5#0, %5#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %4#0, %4#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %3#0, %3#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } return %2#0, %2#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> } - func.func private @Unknown48(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown48(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x256x14x14xf16> %1 = tensor.empty() 
: tensor<4x256x14x14xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x256x14x14xf16>) outs(%0, %1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %2:2 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %3:2 = scf.for %arg5 = %c0 to %c256 step %c1 iter_args(%arg6 = %arg3, %arg7 = %arg4) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %4:2 = scf.for %arg8 = %c0 to %c14 step %c1 iter_args(%arg9 = %arg6, %arg10 = %arg7) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %5:2 = scf.for %arg11 = %c0 to %c14 step %c1 iter_args(%arg12 = %arg9, %arg13 = %arg10) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16, %out_3: i1): + %9 = arith.addf %in, %in_2 : f16 + %10 = arith.maximumf %9, %cst : f16 + %11 = arith.cmpf ogt, %10, %cst : f16 + linalg.yield %10, %11 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg12[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xf16> + %inserted_slice_1 = tensor.insert_slice %8#1 into %arg13[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xi1> + scf.yield %inserted_slice, %inserted_slice_1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %5#0, %5#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %4#0, %4#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %3#0, %3#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } return %2#0, %2#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> } - func.func private @Unknown50(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = tensor.empty() : tensor<4x256x14x14xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%0, %1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - return %2#0, %2#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> - } - func.func private @Unknown53(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + 
func.func private @Unknown55(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> %1 = tensor.empty() : tensor<4x512x7x7xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x512x7x7xf16>) outs(%0, %1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c512 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c7 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %5#0, %5#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %4#0, %4#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %3#0, %3#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } return %2#0, %2#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> } - func.func private @Unknown55(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown57(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> %1 = tensor.empty() : tensor<4x512x7x7xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%0, %1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { - ^bb0(%in: 
f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %2:2 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %3:2 = scf.for %arg5 = %c0 to %c512 step %c1 iter_args(%arg6 = %arg3, %arg7 = %arg4) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %4:2 = scf.for %arg8 = %c0 to %c7 step %c1 iter_args(%arg9 = %arg6, %arg10 = %arg7) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %5:2 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %arg9, %arg13 = %arg10) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16, %out_3: i1): + %9 = arith.addf %in, %in_2 : f16 + %10 = arith.maximumf %9, %cst : f16 + %11 = arith.cmpf ogt, %10, %cst : f16 + linalg.yield %10, %11 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg12[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + %inserted_slice_1 = tensor.insert_slice %8#1 into %arg13[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xi1> + scf.yield %inserted_slice, %inserted_slice_1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %5#0, %5#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %4#0, %4#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %3#0, %3#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } return %2#0, %2#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> } - func.func private @Unknown57(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown62(%arg0: tensor<4x512x7x7xf16>) -> tensor<4x512xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = tensor.empty() : tensor<4x512x7x7xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x512x7x7xf16>) outs(%0, %1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - return %2#0, %2#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> - } - func.func private @Unknown59(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = tensor.empty() : tensor<4x512x7x7xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, 
#map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%0, %1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - return %2#0, %2#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> - } - func.func private @Unknown60(%arg0: tensor<4x512xf16>) -> tensor<4x512xf16> attributes {__byteir_elementwise_fusion__} { + %collapsed = tensor.collapse_shape %arg0 [[0, 1], [2, 3]] : tensor<4x512x7x7xf16> into tensor<2048x49xf16> + %0 = tensor.empty() : tensor<2048xf16> + %1 = scf.forall (%arg1) in (2048) shared_outs(%arg2 = %0) -> (tensor<2048xf16>) { + %extracted_slice = tensor.extract_slice %collapsed[%arg1, 0] [1, 49] [1, 1] : tensor<2048x49xf16> to tensor<49xf16> + %expanded_0 = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<49xf16> into tensor<1x49xf16> + %extracted_slice_1 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<2048xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %3 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %2) -> (tensor<64xf16>) { + %15 = affine.min #map6(%arg3) + %16 = affine.min #map7(%arg3) + %17 = affine.apply #map3(%16, %15) + %extracted_slice_7 = tensor.extract_slice %expanded_0[0, %15] [1, %17] [1, 1] : tensor<1x49xf16> to tensor + %expanded_8 = tensor.expand_shape %extracted_slice_7 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %extracted_slice_7, %c0 : tensor + %18 = arith.cmpi ugt, %dim, %c0 : index + %19 = scf.if %18 -> (f16) { + %extracted = tensor.extract %expanded_8[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %20 = arith.addf %19, %cst : f16 + %extracted_slice_9 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %20 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %3 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %5 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %4) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<32x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_2[%arg3, %c1] : tensor<32x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %5 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %7 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %6) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<16x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_3[%arg3, %c1] : tensor<16x2xf16> + %16 = arith.addf 
%extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %7 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %9 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %8) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<8x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_4[%arg3, %c1] : tensor<8x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %9 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %11 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %10) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<4x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_5[%arg3, %c1] : tensor<4x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %11 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %13 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %12) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<2x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_6[%arg3, %c1] : tensor<2x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %14 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_1) -> (tensor) { + %15 = affine.apply #map4(%arg3) + %extracted = tensor.extract %13[%15] : tensor<2xf16> + %16 = arith.addf %extracted, %cst : f16 + %17 = affine.apply #map5(%arg3) + %extracted_7 = tensor.extract %13[%17] : tensor<2xf16> + %18 = arith.addf %extracted_7, %16 : f16 + %inserted = tensor.insert %18 into %arg4[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %14 into %arg2[%arg1] [1] [1] : tensor into tensor<2048xf16> + } + } {mapping = [#gpu.block]} + %expanded = tensor.expand_shape %1 [[0, 1]] : tensor<2048xf16> into tensor<4x512xf16> + return %expanded : 
tensor<4x512xf16> + } + func.func private @Unknown63(%arg0: tensor<4x512xf16>) -> tensor<4x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 2.040100e-02 : f16 %0 = tensor.empty() : tensor<4x512xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<4x512xf16>) outs(%0 : tensor<4x512xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.mulf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x512xf16> + %1 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0) -> (tensor<4x512xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<4x512xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<4x512xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f16): + %5 = arith.mulf %in, %cst : f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<4x512xf16> + scf.yield %inserted_slice : tensor<4x512xf16> + } + scf.yield %2 : tensor<4x512xf16> + } return %1 : tensor<4x512xf16> } - func.func private @Unknown61(%arg0: tensor<1000xf32>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown64(%arg0: tensor<1000xf16>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<4x1000xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map2, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : tensor<4x1000xf16>, tensor<1000xf32>) outs(%0 : tensor<4x1000xf16>) { - ^bb0(%in: f16, %in_0: f32, %out: f16): - %2 = arith.truncf %in_0 : f32 to f16 - %3 = arith.addf %in, %2 : f16 - linalg.yield %3 : f16 - } -> tensor<4x1000xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x1000xf16>) { + %2 = scf.for %arg4 = %c0 to %c1000 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg4] [1] [1] : tensor<1000xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4] [1, 1] [1, 1] : tensor<4x1000xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %5 = arith.addf %in_1, %in : f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + scf.yield %inserted_slice : tensor<4x1000xf16> + } + scf.yield %2 : tensor<4x1000xf16> + } return %1 : tensor<4x1000xf16> } - func.func private @Unknown62(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown65(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = 
arith.constant 0.000000e+00 : f16 + %0 = tensor.empty() : tensor<4xf16> + %1 = scf.forall (%arg1) in (4) shared_outs(%arg2 = %0) -> (tensor<4xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, 0] [1, 1000] [1, 1] : tensor<4x1000xf16> to tensor<1000xf16> + %expanded = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<1000xf16> into tensor<1x1000xf16> + %extracted_slice_0 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<4xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<512xf16> + %3 = scf.forall (%arg3) in (512) shared_outs(%arg4 = %2) -> (tensor<512xf16>) { + %21 = affine.min #map1(%arg3) + %22 = affine.min #map2(%arg3) + %23 = affine.apply #map3(%22, %21) + %extracted_slice_9 = tensor.extract_slice %expanded[0, %21] [1, %23] [1, 1] : tensor<1x1000xf16> to tensor + %expanded_10 = tensor.expand_shape %extracted_slice_9 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %extracted_slice_9, %c0 : tensor + %24 = arith.cmpi ugt, %dim, %c0 : index + %25 = scf.if %24 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %dim_11 = tensor.dim %extracted_slice_9, %c0 : tensor + %26 = arith.cmpi ugt, %dim_11, %c1 : index + %27 = scf.if %26 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c1] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %28 = arith.maximumf %25, %27 : f16 + %extracted_slice_12 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<512xf16> to tensor + %inserted = tensor.insert %28 into %extracted_slice_12[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<512xf16> + } + } {mapping = [#gpu.thread]} + %expanded_1 = tensor.expand_shape %3 [[0, 1]] : tensor<512xf16> into tensor<256x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<256xf16> + %5 = scf.forall (%arg3) in (256) shared_outs(%arg4 = %4) -> (tensor<256xf16>) { + %extracted = tensor.extract %expanded_1[%arg3, %c0] : tensor<256x2xf16> + %extracted_9 = tensor.extract %expanded_1[%arg3, %c1] : tensor<256x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<256xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<256xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %5 [[0, 1]] : tensor<256xf16> into tensor<128x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<128xf16> + %7 = scf.forall (%arg3) in (128) shared_outs(%arg4 = %6) -> (tensor<128xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<128x2xf16> + %extracted_9 = tensor.extract %expanded_2[%arg3, %c1] : tensor<128x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<128xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %7 [[0, 1]] : tensor<128xf16> into tensor<64x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = 
#gpu.address_space} : tensor<64xf16> + %9 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %8) -> (tensor<64xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<64x2xf16> + %extracted_9 = tensor.extract %expanded_3[%arg3, %c1] : tensor<64x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %9 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %11 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %10) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<32x2xf16> + %extracted_9 = tensor.extract %expanded_4[%arg3, %c1] : tensor<32x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %11 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %13 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %12) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<16x2xf16> + %extracted_9 = tensor.extract %expanded_5[%arg3, %c1] : tensor<16x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %13 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %15 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %14) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<8x2xf16> + %extracted_9 = tensor.extract %expanded_6[%arg3, %c1] : tensor<8x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %15 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %17 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %16) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_7[%arg3, %c0] : tensor<4x2xf16> + %extracted_9 = tensor.extract %expanded_7[%arg3, %c1] : tensor<4x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %21 into 
%extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_8 = tensor.expand_shape %17 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %19 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %18) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_8[%arg3, %c0] : tensor<2x2xf16> + %extracted_9 = tensor.extract %expanded_8[%arg3, %c1] : tensor<2x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %20 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_0) -> (tensor) { + %21 = affine.apply #map4(%arg3) + %extracted = tensor.extract %19[%21] : tensor<2xf16> + %22 = affine.apply #map5(%arg3) + %extracted_9 = tensor.extract %19[%22] : tensor<2xf16> + %23 = arith.maximumf %extracted_9, %extracted : f16 + %inserted = tensor.insert %23 into %arg4[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %20 into %arg2[%arg1] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.block]} + return %1 : tensor<4xf16> + } + func.func private @Unknown66(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<4x1000xf16> - %1:2 = linalg.generic {indexing_maps = [#map1, #map3, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : tensor<4x1000xf16>, tensor<4xf16>) outs(%0, %0 : tensor<4x1000xf16>, tensor<4x1000xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: f16): - %2 = arith.subf %in, %in_0 : f16 - %3 = math.exp %2 : f16 - linalg.yield %2, %3 : f16, f16 - } -> (tensor<4x1000xf16>, tensor<4x1000xf16>) - return %1#0, %1#1 : tensor<4x1000xf16>, tensor<4x1000xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x1000xf16>) { + %2 = scf.for %arg4 = %c0 to %c1000 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<4xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4] [1, 1] [1, 1] : tensor<4x1000xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %5 = arith.subf %in_1, %in : f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + scf.yield %inserted_slice : tensor<4x1000xf16> + } + scf.yield %2 : tensor<4x1000xf16> + } + return %1 : tensor<4x1000xf16> } - func.func private @Unknown63(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>, %arg2: tensor<4xf16>, %arg3: tensor<4x1000xf16>, %arg4: tensor<4x1000xf32>) -> 
(tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown67(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = tensor.empty() : tensor<4xf16> + %1 = scf.forall (%arg1) in (4) shared_outs(%arg2 = %0) -> (tensor<4xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, 0] [1, 1000] [1, 1] : tensor<4x1000xf16> to tensor<1000xf16> + %expanded = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<1000xf16> into tensor<1x1000xf16> + %extracted_slice_0 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<4xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<512xf16> + %3 = scf.forall (%arg3) in (512) shared_outs(%arg4 = %2) -> (tensor<512xf16>) { + %21 = affine.min #map1(%arg3) + %22 = affine.min #map2(%arg3) + %23 = affine.apply #map3(%22, %21) + %extracted_slice_9 = tensor.extract_slice %expanded[0, %21] [1, %23] [1, 1] : tensor<1x1000xf16> to tensor + %expanded_10 = tensor.expand_shape %extracted_slice_9 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %extracted_slice_9, %c0 : tensor + %24 = arith.cmpi ugt, %dim, %c0 : index + %25 = scf.if %24 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %26 = math.exp %25 : f16 + %27 = arith.addf %26, %cst : f16 + %dim_11 = tensor.dim %extracted_slice_9, %c0 : tensor + %28 = arith.cmpi ugt, %dim_11, %c1 : index + %29 = scf.if %28 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c1] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %30 = math.exp %29 : f16 + %31 = arith.addf %27, %30 : f16 + %extracted_slice_12 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<512xf16> to tensor + %inserted = tensor.insert %31 into %extracted_slice_12[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<512xf16> + } + } {mapping = [#gpu.thread]} + %expanded_1 = tensor.expand_shape %3 [[0, 1]] : tensor<512xf16> into tensor<256x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<256xf16> + %5 = scf.forall (%arg3) in (256) shared_outs(%arg4 = %4) -> (tensor<256xf16>) { + %extracted = tensor.extract %expanded_1[%arg3, %c0] : tensor<256x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_1[%arg3, %c1] : tensor<256x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<256xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<256xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %5 [[0, 1]] : tensor<256xf16> into tensor<128x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<128xf16> + %7 = scf.forall (%arg3) in (128) shared_outs(%arg4 = %6) -> (tensor<128xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<128x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_2[%arg3, %c1] : tensor<128x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = 
tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<128xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %7 [[0, 1]] : tensor<128xf16> into tensor<64x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %9 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %8) -> (tensor<64xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<64x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_3[%arg3, %c1] : tensor<64x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %9 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %11 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %10) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<32x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_4[%arg3, %c1] : tensor<32x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %11 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %13 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %12) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<16x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_5[%arg3, %c1] : tensor<16x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %13 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %15 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %14) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<8x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_6[%arg3, %c1] : tensor<8x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_7 = 
tensor.expand_shape %15 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %17 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %16) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_7[%arg3, %c0] : tensor<4x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_7[%arg3, %c1] : tensor<4x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_8 = tensor.expand_shape %17 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %19 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %18) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_8[%arg3, %c0] : tensor<2x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_8[%arg3, %c1] : tensor<2x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %20 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_0) -> (tensor) { + %21 = affine.apply #map4(%arg3) + %extracted = tensor.extract %19[%21] : tensor<2xf16> + %22 = arith.addf %extracted, %cst : f16 + %23 = affine.apply #map5(%arg3) + %extracted_9 = tensor.extract %19[%23] : tensor<2xf16> + %24 = arith.addf %extracted_9, %22 : f16 + %inserted = tensor.insert %24 into %arg4[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %20 into %arg2[%arg1] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.block]} + return %1 : tensor<4xf16> + } + func.func private @Unknown68(%arg0: tensor<4xf16>) -> tensor<4xf16> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<4xf16> + %1 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0) -> (tensor<4xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1] [1] [1] : tensor<4xf16> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f16, %out: f16): + %4 = math.log %in : f16 + linalg.yield %4 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[%arg1] [1] [1] : tensor into tensor<4xf16> + scf.yield %inserted_slice : tensor<4xf16> + } + return %1 : tensor<4xf16> + } + func.func private @Unknown69(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>, %arg2: tensor<4xf16>, %arg3: tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : 
tensor<4x1000xf16> - %1 = tensor.empty() : tensor<4x1000xf32> - %2:3 = linalg.generic {indexing_maps = [#map1, #map1, #map3, #map3, #map1, #map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg3, %arg1, %arg0, %arg2, %arg4 : tensor<4x1000xf16>, tensor<4x1000xf16>, tensor<4xf16>, tensor<4xf16>, tensor<4x1000xf32>) outs(%0, %1, %1 : tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %in_2: f16, %in_3: f32, %out: f16, %out_4: f32, %out_5: f32): - %3 = math.log %in_1 : f16 - %4 = arith.subf %in_0, %3 : f16 - %5 = math.exp %4 : f16 - %6 = arith.mulf %5, %in_2 : f16 - %7 = arith.subf %in, %6 : f16 - %8 = arith.extf %4 : f16 to f32 - %9 = arith.mulf %8, %in_3 : f32 - %10 = arith.extf %7 : f16 to f32 - linalg.yield %7, %9, %10 : f16, f32, f32 - } -> (tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) - return %2#0, %2#1, %2#2 : tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32> + %1:2 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %0, %arg6 = %0) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) { + %2:2 = scf.for %arg7 = %c0 to %c1000 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg2[%arg4] [1] [1] : tensor<4xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg0[%arg4] [1] [1] : tensor<4xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg1[%arg4, %arg7] [1, 1] [1, 1] : tensor<4x1000xf16> to tensor + %extracted_slice_2 = tensor.extract_slice %arg3[%arg4, %arg7] [1, 1] [1, 1] : tensor<4x1000xf16> to tensor + %3 = tensor.empty() : tensor + %4:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1, %extracted_slice_2 : tensor, tensor, tensor, tensor) outs(%3, %3 : tensor, tensor) { + ^bb0(%in: f16, %in_4: f16, %in_5: f16, %in_6: f16, %out: f16, %out_7: f16): + %5 = arith.subf %in_5, %in_4 : f16 + %6 = math.exp %5 : f16 + %7 = arith.mulf %6, %in : f16 + %8 = arith.subf %in_6, %7 : f16 + linalg.yield %5, %8 : f16, f16 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %4#0 into %arg8[%arg4, %arg7] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + %inserted_slice_3 = tensor.insert_slice %4#1 into %arg9[%arg4, %arg7] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + scf.yield %inserted_slice, %inserted_slice_3 : tensor<4x1000xf16>, tensor<4x1000xf16> + } + scf.yield %2#0, %2#1 : tensor<4x1000xf16>, tensor<4x1000xf16> + } + return %1#0, %1#1 : tensor<4x1000xf16>, tensor<4x1000xf16> } - func.func private @Unknown64(%arg0: tensor<4x512xf16>, %arg1: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown70(%arg0: tensor<4x512xf16>, %arg1: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %cst_0 = arith.constant 4.900000e+01 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map4, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1, %arg0 : tensor<4x512x7x7xi1>, tensor<4x512xf16>) outs(%0 : tensor<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_1: f16, %out: f16): - %2 = arith.divf %in_1, %cst_0 : f16 - %3 = 
arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c512 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x512x7x7xf16>) { + %4 = scf.for %arg8 = %c0 to %c7 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4] [1, 1] [1, 1] : tensor<4x512xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: i1, %out: f16): + %7 = arith.divf %in, %cst_0 : f16 + %8 = arith.select %in_2, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + scf.yield %inserted_slice : tensor<4x512x7x7xf16> + } + scf.yield %4 : tensor<4x512x7x7xf16> + } + scf.yield %3 : tensor<4x512x7x7xf16> + } + scf.yield %2 : tensor<4x512x7x7xf16> + } return %1 : tensor<4x512x7x7xf16> } - func.func private @Unknown68(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown74(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) outs(%0 : tensor<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c512 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x512x7x7xf16>) { + %4 = scf.for %arg8 = %c0 to %c7 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + scf.yield %inserted_slice : tensor<4x512x7x7xf16> + } + scf.yield %4 : 
tensor<4x512x7x7xf16> + } + scf.yield %3 : tensor<4x512x7x7xf16> + } + scf.yield %2 : tensor<4x512x7x7xf16> + } return %1 : tensor<4x512x7x7xf16> } - func.func private @Unknown72(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>, %arg2: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>, %arg2: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%0 : tensor<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x512x7x7xf16> + %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x512x7x7xf16>) { + %2 = scf.for %arg5 = %c0 to %c512 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x512x7x7xf16>) { + %3 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x512x7x7xf16>) { + %4 = scf.for %arg9 = %c0 to %c7 step %c1 iter_args(%arg10 = %arg8) -> (tensor<4x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: i1, %out: f16): + %7 = arith.addf %in, %in_2 : f16 + %8 = arith.select %in_3, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + scf.yield %inserted_slice : tensor<4x512x7x7xf16> + } + scf.yield %4 : tensor<4x512x7x7xf16> + } + scf.yield %3 : tensor<4x512x7x7xf16> + } + scf.yield %2 : tensor<4x512x7x7xf16> + } return %1 : tensor<4x512x7x7xf16> } - func.func private @Unknown76(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) outs(%0 : tensor<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @Unknown83(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: 
tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%0 : tensor<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x256x14x14xf16> + %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x256x14x14xf16>) { + %2 = scf.for %arg5 = %c0 to %c256 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x256x14x14xf16>) { + %3 = scf.for %arg7 = %c0 to %c14 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x256x14x14xf16>) { + %4 = scf.for %arg9 = %c0 to %c14 step %c1 iter_args(%arg10 = %arg8) -> (tensor<4x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: i1, %out: f16): + %7 = arith.addf %in, %in_2 : f16 + %8 = arith.select %in_3, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xf16> + scf.yield %inserted_slice : tensor<4x256x14x14xf16> + } + scf.yield %4 : tensor<4x256x14x14xf16> + } + scf.yield %3 : tensor<4x256x14x14xf16> + } + scf.yield %2 : tensor<4x256x14x14xf16> + } return %1 : tensor<4x256x14x14xf16> } - func.func private @Unknown87(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown93(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) outs(%0 : tensor<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x256x14x14xf16> + %1 = 
scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x256x14x14xf16>) { + %2 = scf.for %arg4 = %c0 to %c256 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x256x14x14xf16>) { + %3 = scf.for %arg6 = %c0 to %c14 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x256x14x14xf16>) { + %4 = scf.for %arg8 = %c0 to %c14 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xf16> + scf.yield %inserted_slice : tensor<4x256x14x14xf16> + } + scf.yield %4 : tensor<4x256x14x14xf16> + } + scf.yield %3 : tensor<4x256x14x14xf16> + } + scf.yield %2 : tensor<4x256x14x14xf16> + } return %1 : tensor<4x256x14x14xf16> } - func.func private @Unknown91(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%0 : tensor<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown95(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) outs(%0 : tensor<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown102(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%0 : tensor<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x128x28x28xf16> - 
return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown106(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown108(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) outs(%0 : tensor<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x128x28x28xf16> + %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x128x28x28xf16>) { + %2 = scf.for %arg5 = %c0 to %c128 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x128x28x28xf16>) { + %3 = scf.for %arg7 = %c0 to %c28 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x128x28x28xf16>) { + %4 = scf.for %arg9 = %c0 to %c28 step %c1 iter_args(%arg10 = %arg8) -> (tensor<4x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: i1, %out: f16): + %7 = arith.addf %in, %in_2 : f16 + %8 = arith.select %in_3, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xf16> + scf.yield %inserted_slice : tensor<4x128x28x28xf16> + } + scf.yield %4 : tensor<4x128x28x28xf16> + } + scf.yield %3 : tensor<4x128x28x28xf16> + } + scf.yield %2 : tensor<4x128x28x28xf16> + } return %1 : tensor<4x128x28x28xf16> } - func.func private @Unknown110(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown112(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%0 : tensor<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, 
%out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x128x28x28xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x128x28x28xf16>) { + %2 = scf.for %arg4 = %c0 to %c128 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x128x28x28xf16>) { + %3 = scf.for %arg6 = %c0 to %c28 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x128x28x28xf16>) { + %4 = scf.for %arg8 = %c0 to %c28 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xf16> + scf.yield %inserted_slice : tensor<4x128x28x28xf16> + } + scf.yield %4 : tensor<4x128x28x28xf16> + } + scf.yield %3 : tensor<4x128x28x28xf16> + } + scf.yield %2 : tensor<4x128x28x28xf16> + } return %1 : tensor<4x128x28x28xf16> } - func.func private @Unknown114(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) outs(%0 : tensor<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown121(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown127(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x64x56x56xf16> + %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x64x56x56xf16>) { + %2 = scf.for %arg5 = %c0 to %c64 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x64x56x56xf16>) { + %3 = scf.for %arg7 = %c0 to %c56 step %c1 iter_args(%arg8 = %arg6) -> 
(tensor<4x64x56x56xf16>) { + %4 = scf.for %arg9 = %c0 to %c56 step %c1 iter_args(%arg10 = %arg8) -> (tensor<4x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: i1, %out: f16): + %7 = arith.addf %in, %in_2 : f16 + %8 = arith.select %in_3, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + scf.yield %inserted_slice : tensor<4x64x56x56xf16> + } + scf.yield %4 : tensor<4x64x56x56xf16> + } + scf.yield %3 : tensor<4x64x56x56xf16> + } + scf.yield %2 : tensor<4x64x56x56xf16> + } return %1 : tensor<4x64x56x56xf16> } - func.func private @Unknown125(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown131(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x64x56x56xf16>) { + %4 = scf.for %arg8 = %c0 to %c56 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + scf.yield %inserted_slice : tensor<4x64x56x56xf16> + } + scf.yield %4 : tensor<4x64x56x56xf16> + } + scf.yield %3 : tensor<4x64x56x56xf16> + } + scf.yield %2 : 
tensor<4x64x56x56xf16> + } return %1 : tensor<4x64x56x56xf16> } - func.func private @Unknown129(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + func.func private @Unknown143(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x64x56x56xf16>) { + %4 = scf.for %arg8 = %c0 to %c56 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %7 = arith.addf %in, %in_1 : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + scf.yield %inserted_slice : tensor<4x64x56x56xf16> + } + scf.yield %4 : tensor<4x64x56x56xf16> + } + scf.yield %3 : tensor<4x64x56x56xf16> + } + scf.yield %2 : tensor<4x64x56x56xf16> + } return %1 : tensor<4x64x56x56xf16> } - func.func private @Unknown133(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @Unknown137(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0 : 
tensor<4x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - linalg.yield %2 : f16 - } -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @Unknown138(%arg0: tensor<4x64x112x112xi1>, %arg1: tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown144(%arg0: tensor<4x64x112x112xi1>, %arg1: tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x112x112xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16>) outs(%0 : tensor<4x64x112x112xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x64x112x112xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x64x112x112xf16>) { + %2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x64x112x112xf16>) { + %3 = scf.for %arg6 = %c0 to %c112 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x64x112x112xf16>) { + %4 = scf.for %arg8 = %c0 to %c112 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x64x112x112xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x112x112xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x112x112xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x112x112xf16> + scf.yield %inserted_slice : tensor<4x64x112x112xf16> + } + scf.yield %4 : tensor<4x64x112x112xf16> + } + scf.yield %3 : tensor<4x64x112x112xf16> + } + scf.yield %2 : tensor<4x64x112x112xf16> + } return %1 : tensor<4x64x112x112xf16> } - func.func private @Unknown141(%arg0: tensor) -> tensor attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown147(%arg0: tensor<4x1000xf16>, %arg1: tensor<4x1000xf32>) -> tensor attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %cst_0 = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor + %collapsed = tensor.collapse_shape %arg0 [[0, 1]] : tensor<4x1000xf16> into tensor<4000xf16> + %collapsed_1 = tensor.collapse_shape %arg1 [[0, 1]] : tensor<4x1000xf32> into tensor<4000xf32> + %expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<4000xf16> into tensor<32x125xf16> + %expanded_2 = tensor.expand_shape %collapsed_1 [[0, 1]] : tensor<4000xf32> into tensor<32x125xf32> + %1 = tensor.empty() : tensor<32xf32> + %2 = scf.forall (%arg2) in (32) shared_outs(%arg3 = %1) -> (tensor<32xf32>) { + %extracted_slice = tensor.extract_slice %expanded[%arg2, 0] [1, 125] [1, 
1] : tensor<32x125xf16> to tensor<125xf16> + %expanded_3 = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<125xf16> into tensor<1x125xf16> + %extracted_slice_4 = tensor.extract_slice %expanded_2[%arg2, 0] [1, 125] [1, 1] : tensor<32x125xf32> to tensor<125xf32> + %expanded_5 = tensor.expand_shape %extracted_slice_4 [[0, 1]] : tensor<125xf32> into tensor<1x125xf32> + %extracted_slice_6 = tensor.extract_slice %arg3[%arg2] [1] [1] : tensor<32xf32> to tensor + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<128xf32> + %5 = scf.forall (%arg4) in (128) shared_outs(%arg5 = %4) -> (tensor<128xf32>) { + %19 = affine.min #map8(%arg4) + %20 = affine.min #map9(%arg4) + %21 = affine.apply #map3(%20, %19) + %extracted_slice_13 = tensor.extract_slice %expanded_3[0, %19] [1, %21] [1, 1] : tensor<1x125xf16> to tensor + %expanded_14 = tensor.expand_shape %extracted_slice_13 [[0, 1]] : tensor into tensor<1x?xf16> + %extracted_slice_15 = tensor.extract_slice %expanded_5[0, %19] [1, %21] [1, 1] : tensor<1x125xf32> to tensor + %expanded_16 = tensor.expand_shape %extracted_slice_15 [[0, 1]] : tensor into tensor<1x?xf32> + %dim = tensor.dim %extracted_slice_13, %c0 : tensor + %22 = arith.cmpi ugt, %dim, %c0 : index + %23 = scf.if %22 -> (f16) { + %extracted = tensor.extract %expanded_14[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %dim_17 = tensor.dim %extracted_slice_15, %c0 : tensor + %24 = arith.cmpi ugt, %dim_17, %c0 : index + %25 = scf.if %24 -> (f32) { + %extracted = tensor.extract %expanded_16[%c0, %c0] : tensor<1x?xf32> + scf.yield %extracted : f32 + } else { + scf.yield %cst_0 : f32 + } + %26 = arith.extf %23 : f16 to f32 + %27 = arith.mulf %26, %25 : f32 + %28 = arith.addf %27, %cst_0 : f32 + %extracted_slice_18 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<128xf32> to tensor + %inserted = tensor.insert %28 into %extracted_slice_18[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<128xf32> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %5 [[0, 1]] : tensor<128xf32> into tensor<64x2xf32> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf32> + %7 = scf.forall (%arg4) in (64) shared_outs(%arg5 = %6) -> (tensor<64xf32>) { + %extracted = tensor.extract %expanded_7[%arg4, %c0] : tensor<64x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_7[%arg4, %c1] : tensor<64x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<64xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<64xf32> + } + } {mapping = [#gpu.thread]} + %expanded_8 = tensor.expand_shape %7 [[0, 1]] : tensor<64xf32> into tensor<32x2xf32> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf32> + %9 = scf.forall (%arg4) in (32) shared_outs(%arg5 = %8) -> (tensor<32xf32>) { + %extracted = tensor.extract %expanded_8[%arg4, %c0] : tensor<32x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_8[%arg4, %c1] : tensor<32x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<32xf32> to tensor + %inserted = tensor.insert %20 into 
%extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<32xf32> + } + } {mapping = [#gpu.thread]} + %expanded_9 = tensor.expand_shape %9 [[0, 1]] : tensor<32xf32> into tensor<16x2xf32> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf32> + %11 = scf.forall (%arg4) in (16) shared_outs(%arg5 = %10) -> (tensor<16xf32>) { + %extracted = tensor.extract %expanded_9[%arg4, %c0] : tensor<16x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_9[%arg4, %c1] : tensor<16x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<16xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<16xf32> + } + } {mapping = [#gpu.thread]} + %expanded_10 = tensor.expand_shape %11 [[0, 1]] : tensor<16xf32> into tensor<8x2xf32> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf32> + %13 = scf.forall (%arg4) in (8) shared_outs(%arg5 = %12) -> (tensor<8xf32>) { + %extracted = tensor.extract %expanded_10[%arg4, %c0] : tensor<8x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_10[%arg4, %c1] : tensor<8x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<8xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<8xf32> + } + } {mapping = [#gpu.thread]} + %expanded_11 = tensor.expand_shape %13 [[0, 1]] : tensor<8xf32> into tensor<4x2xf32> + %14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf32> + %15 = scf.forall (%arg4) in (4) shared_outs(%arg5 = %14) -> (tensor<4xf32>) { + %extracted = tensor.extract %expanded_11[%arg4, %c0] : tensor<4x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_11[%arg4, %c1] : tensor<4x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<4xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<4xf32> + } + } {mapping = [#gpu.thread]} + %expanded_12 = tensor.expand_shape %15 [[0, 1]] : tensor<4xf32> into tensor<2x2xf32> + %16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf32> + %17 = scf.forall (%arg4) in (2) shared_outs(%arg5 = %16) -> (tensor<2xf32>) { + %extracted = tensor.extract %expanded_12[%arg4, %c0] : tensor<2x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_12[%arg4, %c1] : tensor<2x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<2xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<2xf32> + } + } {mapping = [#gpu.thread]} + %18 = scf.forall (%arg4) in (1) shared_outs(%arg5 = %extracted_slice_6) -> (tensor) { + %19 = affine.apply #map4(%arg4) + 
%extracted = tensor.extract %17[%19] : tensor<2xf32> + %20 = arith.addf %extracted, %cst_0 : f32 + %21 = affine.apply #map5(%arg4) + %extracted_13 = tensor.extract %17[%21] : tensor<2xf32> + %22 = arith.addf %extracted_13, %20 : f32 + %inserted = tensor.insert %22 into %arg5[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %18 into %arg3[%arg2] [1] [1] : tensor into tensor<32xf32> + } + } {mapping = [#gpu.block]} + %3 = scf.forall (%arg2) in (1) shared_outs(%arg3 = %0) -> (tensor) { + %4 = affine.apply #map10(%arg2) + %extracted_slice = tensor.extract_slice %2[%4] [32] [1] : tensor<32xf32> to tensor<32xf32> + %expanded_3 = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<32xf32> into tensor<32x1xf32> + %5 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf32> + %6 = scf.forall (%arg4) in (32) shared_outs(%arg5 = %5) -> (tensor<32xf32>) { + %extracted = tensor.extract %expanded_3[%arg4, %c0] : tensor<32x1xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_slice_8 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<32xf32> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<32xf32> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %6 [[0, 1]] : tensor<32xf32> into tensor<16x2xf32> + %7 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf32> + %8 = scf.forall (%arg4) in (16) shared_outs(%arg5 = %7) -> (tensor<16xf32>) { + %extracted = tensor.extract %expanded_4[%arg4, %c0] : tensor<16x2xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_8 = tensor.extract %expanded_4[%arg4, %c1] : tensor<16x2xf32> + %17 = arith.addf %extracted_8, %16 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<16xf32> to tensor + %inserted = tensor.insert %17 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<16xf32> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %8 [[0, 1]] : tensor<16xf32> into tensor<8x2xf32> + %9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf32> + %10 = scf.forall (%arg4) in (8) shared_outs(%arg5 = %9) -> (tensor<8xf32>) { + %extracted = tensor.extract %expanded_5[%arg4, %c0] : tensor<8x2xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_8 = tensor.extract %expanded_5[%arg4, %c1] : tensor<8x2xf32> + %17 = arith.addf %extracted_8, %16 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<8xf32> to tensor + %inserted = tensor.insert %17 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<8xf32> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %10 [[0, 1]] : tensor<8xf32> into tensor<4x2xf32> + %11 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf32> + %12 = scf.forall (%arg4) in (4) shared_outs(%arg5 = %11) -> (tensor<4xf32>) { + %extracted = tensor.extract %expanded_6[%arg4, %c0] : tensor<4x2xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_8 = tensor.extract %expanded_6[%arg4, %c1] : tensor<4x2xf32> + %17 = arith.addf 
%extracted_8, %16 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<4xf32> to tensor + %inserted = tensor.insert %17 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<4xf32> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %12 [[0, 1]] : tensor<4xf32> into tensor<2x2xf32> + %13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf32> + %14 = scf.forall (%arg4) in (2) shared_outs(%arg5 = %13) -> (tensor<2xf32>) { + %extracted = tensor.extract %expanded_7[%arg4, %c0] : tensor<2x2xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_8 = tensor.extract %expanded_7[%arg4, %c1] : tensor<2x2xf32> + %17 = arith.addf %extracted_8, %16 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<2xf32> to tensor + %inserted = tensor.insert %17 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<2xf32> + } + } {mapping = [#gpu.thread]} + %15 = scf.forall (%arg4) in (1) shared_outs(%arg5 = %arg3) -> (tensor) { + %16 = affine.apply #map4(%arg4) + %extracted = tensor.extract %14[%16] : tensor<2xf32> + %17 = arith.addf %extracted, %cst_0 : f32 + %18 = affine.apply #map5(%arg4) + %extracted_8 = tensor.extract %14[%18] : tensor<2xf32> + %19 = arith.addf %extracted_8, %17 : f32 + %inserted = tensor.insert %19 into %arg5[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %15 into %arg3[] [] [] : tensor into tensor + } + } {mapping = [#gpu.block]} + return %3 : tensor + } + func.func private @Unknown148(%arg0: tensor) -> tensor attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 4.000000e+00 : f32 %0 = tensor.empty() : tensor - %1 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = []} ins(%arg0 : tensor) outs(%0 : tensor) { + %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%arg0 : tensor) outs(%0 : tensor) { ^bb0(%in: f32, %out: f32): %2 = arith.negf %in : f32 %3 = arith.divf %2, %cst : f32 @@ -677,203 +2019,400 @@ module @IrToMhlo.2452 { } -> tensor return %1 : tensor } - func.func private @Unknown142(%arg0: tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown149(%arg0: tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x3x7x7xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x3x7x7xf16>) outs(%0 : tensor<64x3x7x7xf32>) attrs = {xla_shape = "f32[64,3,7,7]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x3x7x7xf32> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x3x7x7xf32>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x3x7x7xf32>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x3x7x7xf32>) { + %4 = scf.for %arg7 = %c0 to %c7 step 
%c1 iter_args(%arg8 = %arg6) -> (tensor<64x3x7x7xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x3x7x7xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x3x7x7xf32> + scf.yield %inserted_slice : tensor<64x3x7x7xf32> + } + scf.yield %4 : tensor<64x3x7x7xf32> + } + scf.yield %3 : tensor<64x3x7x7xf32> + } + scf.yield %2 : tensor<64x3x7x7xf32> + } return %1 : tensor<64x3x7x7xf32> } - func.func private @Unknown143(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown150(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x64x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x64x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x64x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x64x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x64x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x64x3x3xf32> + scf.yield %inserted_slice : tensor<64x64x3x3xf32> + } + scf.yield %4 : tensor<64x64x3x3xf32> + } + scf.yield %3 : tensor<64x64x3x3xf32> + } + scf.yield %2 : tensor<64x64x3x3xf32> + } return %1 : tensor<64x64x3x3xf32> } - func.func private @Unknown144(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown145(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, 
#map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown146(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown147(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown154(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x3x3xf16>) outs(%0 : tensor<128x64x3x3xf32>) attrs = {xla_shape = "f32[128,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x64x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x64x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x64x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x3x3xf32> + scf.yield %inserted_slice : tensor<128x64x3x3xf32> + } + scf.yield %4 : tensor<128x64x3x3xf32> + } + scf.yield %3 : tensor<128x64x3x3xf32> + } + scf.yield %2 : tensor<128x64x3x3xf32> + } return %1 : tensor<128x64x3x3xf32> } - func.func private @Unknown148(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown155(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : 
tensor<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x128x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x128x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x128x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x128x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x128x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x128x3x3xf32> + scf.yield %inserted_slice : tensor<128x128x3x3xf32> + } + scf.yield %4 : tensor<128x128x3x3xf32> + } + scf.yield %3 : tensor<128x128x3x3xf32> + } + scf.yield %2 : tensor<128x128x3x3xf32> + } return %1 : tensor<128x128x3x3xf32> } - func.func private @Unknown149(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown156(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x1x1xf16>) outs(%0 : tensor<128x64x1x1xf32>) attrs = {xla_shape = "f32[128,64,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x64x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x1x1xf32> + scf.yield %inserted_slice : tensor<128x64x1x1xf32> + } + scf.yield %2 : tensor<128x64x1x1xf32> + } return %1 : tensor<128x64x1x1xf32> } - func.func private @Unknown150(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } 
-> tensor<128x128x3x3xf32> - return %1 : tensor<128x128x3x3xf32> - } - func.func private @Unknown151(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> - return %1 : tensor<128x128x3x3xf32> - } - func.func private @Unknown152(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown159(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x3x3xf16>) outs(%0 : tensor<256x128x3x3xf32>) attrs = {xla_shape = "f32[256,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x128x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x128x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x128x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x3x3xf32> + scf.yield %inserted_slice : tensor<256x128x3x3xf32> + } + scf.yield %4 : tensor<256x128x3x3xf32> + } + scf.yield %3 : tensor<256x128x3x3xf32> + } + scf.yield %2 : tensor<256x128x3x3xf32> + } return %1 : tensor<256x128x3x3xf32> } - func.func private @Unknown153(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown160(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 
iter_args(%arg2 = %0) -> (tensor<256x256x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x256x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x256x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x256x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x256x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x256x3x3xf32> + scf.yield %inserted_slice : tensor<256x256x3x3xf32> + } + scf.yield %4 : tensor<256x256x3x3xf32> + } + scf.yield %3 : tensor<256x256x3x3xf32> + } + scf.yield %2 : tensor<256x256x3x3xf32> + } return %1 : tensor<256x256x3x3xf32> } - func.func private @Unknown154(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown161(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x1x1xf16>) outs(%0 : tensor<256x128x1x1xf32>) attrs = {xla_shape = "f32[256,128,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x128x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x1x1xf32> + scf.yield %inserted_slice : tensor<256x128x1x1xf32> + } + scf.yield %2 : tensor<256x128x1x1xf32> + } return %1 : tensor<256x128x1x1xf32> } - func.func private @Unknown155(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> - return %1 : tensor<256x256x3x3xf32> - } - func.func private @Unknown156(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = 
tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> - return %1 : tensor<256x256x3x3xf32> - } - func.func private @Unknown157(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown164(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x3x3xf16>) outs(%0 : tensor<512x256x3x3xf32>) attrs = {xla_shape = "f32[512,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x256x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x256x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x256x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x3x3xf32> + scf.yield %inserted_slice : tensor<512x256x3x3xf32> + } + scf.yield %4 : tensor<512x256x3x3xf32> + } + scf.yield %3 : tensor<512x256x3x3xf32> + } + scf.yield %2 : tensor<512x256x3x3xf32> + } return %1 : tensor<512x256x3x3xf32> } - func.func private @Unknown158(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown165(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x512x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x512x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) 
-> (tensor<512x512x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x512x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x512x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x512x3x3xf32> + scf.yield %inserted_slice : tensor<512x512x3x3xf32> + } + scf.yield %4 : tensor<512x512x3x3xf32> + } + scf.yield %3 : tensor<512x512x3x3xf32> + } + scf.yield %2 : tensor<512x512x3x3xf32> + } return %1 : tensor<512x512x3x3xf32> } - func.func private @Unknown159(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown166(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x1x1xf16>) outs(%0 : tensor<512x256x1x1xf32>) attrs = {xla_shape = "f32[512,256,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x256x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x1x1xf32> + scf.yield %inserted_slice : tensor<512x256x1x1xf32> + } + scf.yield %2 : tensor<512x256x1x1xf32> + } return %1 : tensor<512x256x1x1xf32> } - func.func private @Unknown160(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> - return %1 : tensor<512x512x3x3xf32> - } - func.func private @Unknown161(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) 
attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> - return %1 : tensor<512x512x3x3xf32> - } - func.func private @Unknown163(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown170(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1000x512xf32> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1000x512xf16>) outs(%0 : tensor<1000x512xf32>) attrs = {xla_shape = "f32[1000,512]{0,1}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<1000x512xf32> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000x512xf32>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1000x512xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<1000x512xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<1000x512xf32> + scf.yield %inserted_slice : tensor<1000x512xf32> + } + scf.yield %2 : tensor<1000x512xf32> + } return %1 : tensor<1000x512xf32> } - func.func private @Unknown164(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown171(%arg0: tensor<4x1000xf16>) -> tensor<1000xf32> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %cst_0 = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<1000xf32> - %1 = linalg.generic {indexing_maps = [#map6, #map6], iterator_types = ["parallel"]} ins(%arg0 : tensor<1000xf32>) outs(%0 : tensor<1000xf32>) { - ^bb0(%in: f32, %out: f32): - %2 = arith.truncf %in : f32 to f16 - %3 = arith.extf %2 : f16 to f32 - linalg.yield %3 : f32 - } -> tensor<1000xf32> + %1 = scf.forall (%arg1) in (32) shared_outs(%arg2 = %0) -> (tensor<1000xf32>) { + %2 = affine.min #map11(%arg1) + %3 = affine.apply #map10(%arg1) + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf32> + %5 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2x32xf32> + %6 = scf.forall (%arg3, %arg4) in (2, 32) shared_outs(%arg5 = %5) -> (tensor<2x32xf32>) { + %8 = affine.min #map12(%arg4, %arg1) + %9 = affine.min #map13(%arg4, %arg1) + %10 = affine.apply #map3(%9, %8) + %11 = arith.cmpi ugt, %10, %c0 : index + %12 = scf.if %11 -> (f16) { + %19 = affine.apply #map4(%arg3) + %20 = affine.apply #map14(%arg1)[%8] + %extracted = tensor.extract %arg0[%19, %20] : tensor<4x1000xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %13 = arith.extf %12 : f16 to f32 + %14 = arith.addf %13, %cst_0 : f32 + %15 = arith.cmpi ugt, %10, %c0 : index + %16 = scf.if %15 -> (f16) { + %19 = affine.apply #map5(%arg3) + %20 = affine.apply #map14(%arg1)[%8] + 
%extracted = tensor.extract %arg0[%19, %20] : tensor<4x1000xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %17 = arith.extf %16 : f16 to f32 + %18 = arith.addf %14, %17 : f32 + %extracted_slice_1 = tensor.extract_slice %arg5[%arg3, %arg4] [1, 1] [1, 1] : tensor<2x32xf32> to tensor + %inserted = tensor.insert %18 into %extracted_slice_1[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg3, %arg4] [1, 1] [1, 1] : tensor into tensor<2x32xf32> + } + } {mapping = [#gpu.thread, #gpu.thread]} + %7 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %4) -> (tensor<32xf32>) { + %extracted = tensor.extract %6[%c0, %arg3] : tensor<2x32xf32> + %8 = arith.addf %extracted, %cst_0 : f32 + %extracted_1 = tensor.extract %6[%c1, %arg3] : tensor<2x32xf32> + %9 = arith.addf %extracted_1, %8 : f32 + %extracted_slice_2 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf32> to tensor + %inserted = tensor.insert %9 into %extracted_slice_2[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf32> + } + } {mapping = [#gpu.thread]} + %extracted_slice = tensor.extract_slice %7[0] [%2] [1] : tensor<32xf32> to tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %extracted_slice into %arg2[%3] [%2] [1] : tensor into tensor<1000xf32> + } + } {mapping = [#gpu.block]} + return %1 : tensor<1000xf32> + } + func.func private @Unknown172(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<1000xf32> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1] [1] [1] : tensor<1000xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %out: f32): + %4 = arith.truncf %in : f32 to f16 + %5 = arith.extf %4 : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[%arg1] [1] [1] : tensor into tensor<1000xf32> + scf.yield %inserted_slice : tensor<1000xf32> + } return %1 : tensor<1000xf32> } func.func @main(%arg0: tensor<4x3x224x224xf32>, %arg1: tensor<4x1000xf32>, %arg2: tensor<64x3x7x7xf32>, %arg3: tensor<64xf32>, %arg4: tensor<64xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64x64x3x3xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64xf32>, %arg10: tensor<64xf32>, %arg11: tensor<64xf32>, %arg12: tensor<64x64x3x3xf32>, %arg13: tensor<64xf32>, %arg14: tensor<64xf32>, %arg15: tensor<64xf32>, %arg16: tensor<64xf32>, %arg17: tensor<64x64x3x3xf32>, %arg18: tensor<64xf32>, %arg19: tensor<64xf32>, %arg20: tensor<64xf32>, %arg21: tensor<64xf32>, %arg22: tensor<64x64x3x3xf32>, %arg23: tensor<64xf32>, %arg24: tensor<64xf32>, %arg25: tensor<64xf32>, %arg26: tensor<64xf32>, %arg27: tensor<128x64x3x3xf32>, %arg28: tensor<128xf32>, %arg29: tensor<128xf32>, %arg30: tensor<128xf32>, %arg31: tensor<128xf32>, %arg32: tensor<128x128x3x3xf32>, %arg33: tensor<128xf32>, %arg34: tensor<128xf32>, %arg35: tensor<128xf32>, %arg36: tensor<128xf32>, %arg37: tensor<128x64x1x1xf32>, %arg38: tensor<128xf32>, %arg39: tensor<128xf32>, %arg40: tensor<128xf32>, %arg41: tensor<128xf32>, %arg42: tensor<128x128x3x3xf32>, %arg43: tensor<128xf32>, %arg44: 
tensor<128xf32>, %arg45: tensor<128xf32>, %arg46: tensor<128xf32>, %arg47: tensor<128x128x3x3xf32>, %arg48: tensor<128xf32>, %arg49: tensor<128xf32>, %arg50: tensor<128xf32>, %arg51: tensor<128xf32>, %arg52: tensor<256x128x3x3xf32>, %arg53: tensor<256xf32>, %arg54: tensor<256xf32>, %arg55: tensor<256xf32>, %arg56: tensor<256xf32>, %arg57: tensor<256x256x3x3xf32>, %arg58: tensor<256xf32>, %arg59: tensor<256xf32>, %arg60: tensor<256xf32>, %arg61: tensor<256xf32>, %arg62: tensor<256x128x1x1xf32>, %arg63: tensor<256xf32>, %arg64: tensor<256xf32>, %arg65: tensor<256xf32>, %arg66: tensor<256xf32>, %arg67: tensor<256x256x3x3xf32>, %arg68: tensor<256xf32>, %arg69: tensor<256xf32>, %arg70: tensor<256xf32>, %arg71: tensor<256xf32>, %arg72: tensor<256x256x3x3xf32>, %arg73: tensor<256xf32>, %arg74: tensor<256xf32>, %arg75: tensor<256xf32>, %arg76: tensor<256xf32>, %arg77: tensor<512x256x3x3xf32>, %arg78: tensor<512xf32>, %arg79: tensor<512xf32>, %arg80: tensor<512xf32>, %arg81: tensor<512xf32>, %arg82: tensor<512x512x3x3xf32>, %arg83: tensor<512xf32>, %arg84: tensor<512xf32>, %arg85: tensor<512xf32>, %arg86: tensor<512xf32>, %arg87: tensor<512x256x1x1xf32>, %arg88: tensor<512xf32>, %arg89: tensor<512xf32>, %arg90: tensor<512xf32>, %arg91: tensor<512xf32>, %arg92: tensor<512x512x3x3xf32>, %arg93: tensor<512xf32>, %arg94: tensor<512xf32>, %arg95: tensor<512xf32>, %arg96: tensor<512xf32>, %arg97: tensor<512x512x3x3xf32>, %arg98: tensor<512xf32>, %arg99: tensor<512xf32>, %arg100: tensor<512xf32>, %arg101: tensor<512xf32>, %arg102: tensor<1000x512xf32>, %arg103: tensor<1000xf32>) -> (tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32>) attributes {__placeholder__byre.entry_point} { @@ -884,51 +2423,51 @@ module @IrToMhlo.2452 { %4 = tensor.empty() : tensor<4x64x112x112xf16> %5 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%3, %arg3, %arg4 : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) outs(%4 : tensor<4x64x112x112xf16>) : tensor<4x64x112x112xf16> %6 = call @Unknown3(%arg7) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %7 = call @Unknown4(%arg12) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %8 = call @Unknown5(%arg17) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %9 = call @Unknown6(%arg22) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %7 = call @Unknown3(%arg12) 
: (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %8 = call @Unknown3(%arg17) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %9 = call @Unknown3(%arg22) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> %10 = call @Unknown7(%arg37) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> %11 = call @Unknown8(%arg27) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> %12 = call @Unknown9(%arg32) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %13 = call @Unknown10(%arg42) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %14 = call @Unknown11(%arg47) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %13 = call @Unknown9(%arg42) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %14 = call @Unknown9(%arg47) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> %15 = call @Unknown12(%arg62) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> %16 = call @Unknown13(%arg52) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> %17 = call @Unknown14(%arg57) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %18 = call @Unknown15(%arg67) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %19 = call @Unknown16(%arg72) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %18 = call @Unknown14(%arg67) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %19 = call @Unknown14(%arg72) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> %20 = call @Unknown17(%arg87) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> %21 = call @Unknown18(%arg77) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> %22 = call @Unknown19(%arg82) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %23 = call @Unknown20(%arg92) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %24 = call @Unknown21(%arg97) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %23 = call @Unknown19(%arg92) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %24 = call @Unknown19(%arg97) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> %25 = call @Unknown22(%arg1) : (tensor<4x1000xf32>) -> tensor<4x1000xf16> %26 = call @Unknown23(%arg102) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> - %27 = tensor.empty() : tensor<4xf16> - %28 = byre.compute_on_tensor @ReduceSumOp_f16_f16 {dimensions = dense<1> : tensor<1xi64>} ins(%25 : tensor<4x1000xf16>) outs(%27 : tensor<4xf16>) : tensor<4xf16> - %29:2 = call @Unknown24(%5) : (tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) + %27 = call @Unknown24(%arg103) : (tensor<1000xf32>) -> tensor<1000xf16> + %28 = call @Unknown25(%25) : (tensor<4x1000xf16>) -> tensor<4xf16> + %29:2 = call @Unknown26(%5) : (tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) %30 = tensor.empty() : tensor<4x64x56x56xf16> %31 = byre.compute_on_tensor @PoolMaxOp_f16_f16 {base_dilations = dense<1> : tensor<4xi64>, padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} ins(%29#0 : tensor<4x64x112x112xf16>) outs(%30 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> %32 = tensor.empty() : tensor<4x64x56x56xf16> %33 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, 
window_strides = dense<1> : tensor<2xi64>} ins(%31, %6 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%32 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> %34 = tensor.empty() : tensor<4x64x56x56xf16> %35 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%33, %arg8, %arg9 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) outs(%34 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %36:2 = call @Unknown26(%35) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %36:2 = call @Unknown28(%35) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) %37 = tensor.empty() : tensor<4x64x56x56xf16> %38 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%36#0, %7 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%37 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> %39 = tensor.empty() : tensor<4x64x56x56xf16> %40 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%38, %arg13, %arg14 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) outs(%39 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %41:2 = call @Unknown28(%40, %31) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %41:2 = call @Unknown30(%40, %31) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) %42 = tensor.empty() : tensor<4x64x56x56xf16> %43 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%41#0, %8 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%42 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> %44 = tensor.empty() : tensor<4x64x56x56xf16> %45 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%43, %arg18, %arg19 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) outs(%44 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %46:2 = call @Unknown30(%45) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %46:2 = call @Unknown28(%45) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) %47 = tensor.empty() : tensor<4x64x56x56xf16> %48 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%46#0, %9 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%47 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> %49 = tensor.empty() : tensor<4x64x56x56xf16> %50 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%48, %arg23, %arg24 : tensor<4x64x56x56xf16>, 
tensor<64xf32>, tensor<64xf32>) outs(%49 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %51:2 = call @Unknown32(%50, %41#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %51:2 = call @Unknown30(%50, %41#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) %52 = tensor.empty() : tensor<4x128x28x28xf16> %53 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%51#0, %10 : tensor<4x64x56x56xf16>, tensor<128x64x1x1xf16>) outs(%52 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> %54 = tensor.empty() : tensor<4x128x28x28xf16> @@ -937,22 +2476,22 @@ module @IrToMhlo.2452 { %57 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%51#0, %11 : tensor<4x64x56x56xf16>, tensor<128x64x3x3xf16>) outs(%56 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> %58 = tensor.empty() : tensor<4x128x28x28xf16> %59 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%57, %arg28, %arg29 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) outs(%58 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %60:2 = call @Unknown35(%59) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %60:2 = call @Unknown37(%59) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) %61 = tensor.empty() : tensor<4x128x28x28xf16> %62 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%60#0, %12 : tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%61 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> %63 = tensor.empty() : tensor<4x128x28x28xf16> %64 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%62, %arg33, %arg34 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) outs(%63 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %65:2 = call @Unknown37(%64, %55) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %65:2 = call @Unknown39(%64, %55) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) %66 = tensor.empty() : tensor<4x128x28x28xf16> %67 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%65#0, %13 : tensor<4x128x28x28xf16>, 
tensor<128x128x3x3xf16>) outs(%66 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> %68 = tensor.empty() : tensor<4x128x28x28xf16> %69 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%67, %arg43, %arg44 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) outs(%68 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %70:2 = call @Unknown39(%69) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %70:2 = call @Unknown37(%69) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) %71 = tensor.empty() : tensor<4x128x28x28xf16> %72 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%70#0, %14 : tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%71 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> %73 = tensor.empty() : tensor<4x128x28x28xf16> %74 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%72, %arg48, %arg49 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) outs(%73 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %75:2 = call @Unknown41(%74, %65#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %75:2 = call @Unknown39(%74, %65#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) %76 = tensor.empty() : tensor<4x256x14x14xf16> %77 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%75#0, %15 : tensor<4x128x28x28xf16>, tensor<256x128x1x1xf16>) outs(%76 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> %78 = tensor.empty() : tensor<4x256x14x14xf16> @@ -961,22 +2500,22 @@ module @IrToMhlo.2452 { %81 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%75#0, %16 : tensor<4x128x28x28xf16>, tensor<256x128x3x3xf16>) outs(%80 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> %82 = tensor.empty() : tensor<4x256x14x14xf16> %83 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%81, %arg53, %arg54 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) outs(%82 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %84:2 = call @Unknown44(%83) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %84:2 = call @Unknown46(%83) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) %85 = tensor.empty() : tensor<4x256x14x14xf16> %86 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%84#0, %17 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%85 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> %87 = tensor.empty() : tensor<4x256x14x14xf16> %88 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%86, %arg58, %arg59 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) outs(%87 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %89:2 = call @Unknown46(%88, %79) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %89:2 = call @Unknown48(%88, %79) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) %90 = tensor.empty() : tensor<4x256x14x14xf16> %91 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%89#0, %18 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%90 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> %92 = tensor.empty() : tensor<4x256x14x14xf16> %93 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%91, %arg68, %arg69 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) outs(%92 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %94:2 = call @Unknown48(%93) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %94:2 = call @Unknown46(%93) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) %95 = tensor.empty() : tensor<4x256x14x14xf16> %96 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%94#0, %19 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%95 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> %97 = tensor.empty() : tensor<4x256x14x14xf16> %98 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%96, %arg73, %arg74 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) outs(%97 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %99:2 = call @Unknown50(%98, %89#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %99:2 = call @Unknown48(%98, %89#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) %100 = tensor.empty() : tensor<4x512x7x7xf16> %101 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%99#0, %20 : tensor<4x256x14x14xf16>, 
tensor<512x256x1x1xf16>) outs(%100 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> %102 = tensor.empty() : tensor<4x512x7x7xf16> @@ -985,243 +2524,239 @@ module @IrToMhlo.2452 { %105 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%99#0, %21 : tensor<4x256x14x14xf16>, tensor<512x256x3x3xf16>) outs(%104 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> %106 = tensor.empty() : tensor<4x512x7x7xf16> %107 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%105, %arg78, %arg79 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) outs(%106 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> - %108:2 = call @Unknown53(%107) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %108:2 = call @Unknown55(%107) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) %109 = tensor.empty() : tensor<4x512x7x7xf16> %110 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%108#0, %22 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%109 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> %111 = tensor.empty() : tensor<4x512x7x7xf16> %112 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%110, %arg83, %arg84 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) outs(%111 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> - %113:2 = call @Unknown55(%112, %103) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %113:2 = call @Unknown57(%112, %103) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) %114 = tensor.empty() : tensor<4x512x7x7xf16> %115 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%113#0, %23 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%114 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> %116 = tensor.empty() : tensor<4x512x7x7xf16> %117 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%115, %arg93, %arg94 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) outs(%116 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> - %118:2 = call @Unknown57(%117) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %118:2 = call @Unknown55(%117) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) %119 = tensor.empty() : tensor<4x512x7x7xf16> %120 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : 
tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%118#0, %24 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%119 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> %121 = tensor.empty() : tensor<4x512x7x7xf16> %122 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%120, %arg98, %arg99 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) outs(%121 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> - %123:2 = call @Unknown59(%122, %113#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %124 = tensor.empty() : tensor<4x512xf16> - %125 = byre.compute_on_tensor @ReduceSumOp_f16_f16 {dimensions = dense<[3, 2]> : tensor<2xi64>} ins(%123#0 : tensor<4x512x7x7xf16>) outs(%124 : tensor<4x512xf16>) : tensor<4x512xf16> - %126 = call @Unknown60(%125) : (tensor<4x512xf16>) -> tensor<4x512xf16> - %127 = tensor.empty() : tensor<4x1000xf16> - %128 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 1 : i64} ins(%126, %26 : tensor<4x512xf16>, tensor<1000x512xf16>) outs(%127 : tensor<4x1000xf16>) : tensor<4x1000xf16> - %129 = call @Unknown61(%arg103, %128) : (tensor<1000xf32>, tensor<4x1000xf16>) -> tensor<4x1000xf16> - %130 = tensor.empty() : tensor<4xf16> - %131 = byre.compute_on_tensor @ReduceMaxOp_f16_f16 {dimensions = dense<1> : tensor<1xi64>} ins(%129 : tensor<4x1000xf16>) outs(%130 : tensor<4xf16>) : tensor<4xf16> - %132:2 = call @Unknown62(%131, %129) : (tensor<4xf16>, tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) - %133 = tensor.empty() : tensor<4xf16> - %134 = byre.compute_on_tensor @ReduceSumOp_f16_f16 {dimensions = dense<1> : tensor<1xi64>} ins(%132#1 : tensor<4x1000xf16>) outs(%133 : tensor<4xf16>) : tensor<4xf16> - %135:3 = call @Unknown63(%134, %132#0, %28, %25, %arg1) : (tensor<4xf16>, tensor<4x1000xf16>, tensor<4xf16>, tensor<4x1000xf16>, tensor<4x1000xf32>) -> (tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) - %136 = tensor.empty() : tensor<4x512xf16> - %137 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 0 : i64} ins(%135#0, %26 : tensor<4x1000xf16>, tensor<1000x512xf16>) outs(%136 : tensor<4x512xf16>) : tensor<4x512xf16> - %138 = call @Unknown64(%137, %123#1) : (tensor<4x512xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> - %139 = tensor.empty() : tensor<4x512x7x7xf16> - %140 = tensor.empty() : tensor<512xf32> - %141 = tensor.empty() : tensor<512xf32> - %142:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%120, %arg98, %138 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%139, %140, %141 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - %143 = tensor.empty() : tensor<4x512x7x7xf16> - %144 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%142#0, %24 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%143 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> - %145 = tensor.empty() : tensor<512x512x3x3xf16> - 
%146 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%118#0, %142#0 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%145 : tensor<512x512x3x3xf16>) : tensor<512x512x3x3xf16> - %147 = call @Unknown68(%118#1, %144) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> - %148 = tensor.empty() : tensor<4x512x7x7xf16> - %149 = tensor.empty() : tensor<512xf32> - %150 = tensor.empty() : tensor<512xf32> - %151:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%115, %arg93, %147 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%148, %149, %150 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - %152 = tensor.empty() : tensor<4x512x7x7xf16> - %153 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%151#0, %23 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%152 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> - %154 = tensor.empty() : tensor<512x512x3x3xf16> - %155 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%113#0, %151#0 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%154 : tensor<512x512x3x3xf16>) : tensor<512x512x3x3xf16> - %156 = call @Unknown72(%138, %153, %113#1) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> - %157 = tensor.empty() : tensor<4x512x7x7xf16> - %158 = tensor.empty() : tensor<512xf32> - %159 = tensor.empty() : tensor<512xf32> - %160:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%110, %arg83, %156 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%157, %158, %159 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - %161 = tensor.empty() : tensor<4x512x7x7xf16> - %162 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%160#0, %22 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%161 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> - %163 = tensor.empty() : tensor<512x512x3x3xf16> - %164 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%108#0, %160#0 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%163 : tensor<512x512x3x3xf16>) : tensor<512x512x3x3xf16> - %165 = call @Unknown76(%108#1, %162) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> - 
%166 = tensor.empty() : tensor<4x512x7x7xf16> - %167 = tensor.empty() : tensor<512xf32> - %168 = tensor.empty() : tensor<512xf32> - %169:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%105, %arg78, %165 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%166, %167, %168 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - %170 = tensor.empty() : tensor<4x256x14x14xf16> - %171 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%169#0, %21 : tensor<4x512x7x7xf16>, tensor<512x256x3x3xf16>) outs(%170 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %172 = tensor.empty() : tensor<512x256x3x3xf16> - %173 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%99#0, %169#0 : tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) outs(%172 : tensor<512x256x3x3xf16>) : tensor<512x256x3x3xf16> - %174 = tensor.empty() : tensor<4x512x7x7xf16> - %175 = tensor.empty() : tensor<512xf32> - %176 = tensor.empty() : tensor<512xf32> - %177:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%101, %arg88, %156 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%174, %175, %176 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - %178 = tensor.empty() : tensor<4x256x14x14xf16> - %179 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%177#0, %20 : tensor<4x512x7x7xf16>, tensor<512x256x1x1xf16>) outs(%178 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %180 = tensor.empty() : tensor<512x256x1x1xf16> - %181 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%99#0, %177#0 : tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) outs(%180 : tensor<512x256x1x1xf16>) : tensor<512x256x1x1xf16> - %182 = call @Unknown83(%179, %171, %99#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> - %183 = tensor.empty() : tensor<4x256x14x14xf16> - %184 = tensor.empty() : tensor<256xf32> - %185 = tensor.empty() : tensor<256xf32> - %186:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%96, %arg73, %182 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%183, %184, %185 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - %187 = tensor.empty() : tensor<4x256x14x14xf16> - %188 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 
{batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%186#0, %19 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%187 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %189 = tensor.empty() : tensor<256x256x3x3xf16> - %190 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%94#0, %186#0 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%189 : tensor<256x256x3x3xf16>) : tensor<256x256x3x3xf16> - %191 = call @Unknown87(%94#1, %188) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> - %192 = tensor.empty() : tensor<4x256x14x14xf16> - %193 = tensor.empty() : tensor<256xf32> - %194 = tensor.empty() : tensor<256xf32> - %195:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%91, %arg68, %191 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%192, %193, %194 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - %196 = tensor.empty() : tensor<4x256x14x14xf16> - %197 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%195#0, %18 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%196 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %198 = tensor.empty() : tensor<256x256x3x3xf16> - %199 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%89#0, %195#0 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%198 : tensor<256x256x3x3xf16>) : tensor<256x256x3x3xf16> - %200 = call @Unknown91(%182, %197, %89#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> - %201 = tensor.empty() : tensor<4x256x14x14xf16> - %202 = tensor.empty() : tensor<256xf32> - %203 = tensor.empty() : tensor<256xf32> - %204:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%86, %arg58, %200 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%201, %202, %203 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - %205 = tensor.empty() : tensor<4x256x14x14xf16> - %206 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%204#0, %17 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%205 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %207 = tensor.empty() : tensor<256x256x3x3xf16> - %208 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 
{batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%84#0, %204#0 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%207 : tensor<256x256x3x3xf16>) : tensor<256x256x3x3xf16> - %209 = call @Unknown95(%84#1, %206) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> - %210 = tensor.empty() : tensor<4x256x14x14xf16> - %211 = tensor.empty() : tensor<256xf32> - %212 = tensor.empty() : tensor<256xf32> - %213:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%81, %arg53, %209 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%210, %211, %212 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - %214 = tensor.empty() : tensor<4x128x28x28xf16> - %215 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%213#0, %16 : tensor<4x256x14x14xf16>, tensor<256x128x3x3xf16>) outs(%214 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %216 = tensor.empty() : tensor<256x128x3x3xf16> - %217 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%75#0, %213#0 : tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) outs(%216 : tensor<256x128x3x3xf16>) : tensor<256x128x3x3xf16> - %218 = tensor.empty() : tensor<4x256x14x14xf16> - %219 = tensor.empty() : tensor<256xf32> - %220 = tensor.empty() : tensor<256xf32> - %221:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%77, %arg63, %200 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%218, %219, %220 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - %222 = tensor.empty() : tensor<4x128x28x28xf16> - %223 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%221#0, %15 : tensor<4x256x14x14xf16>, tensor<256x128x1x1xf16>) outs(%222 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %224 = tensor.empty() : tensor<256x128x1x1xf16> - %225 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%75#0, %221#0 : tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) outs(%224 : tensor<256x128x1x1xf16>) : tensor<256x128x1x1xf16> - %226 = call @Unknown102(%223, %215, %75#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> - %227 = tensor.empty() : tensor<4x128x28x28xf16> - %228 = tensor.empty() : tensor<128xf32> - %229 = tensor.empty() : 
tensor<128xf32> - %230:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%72, %arg48, %226 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%227, %228, %229 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - %231 = tensor.empty() : tensor<4x128x28x28xf16> - %232 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%230#0, %14 : tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%231 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %233 = tensor.empty() : tensor<128x128x3x3xf16> - %234 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%70#0, %230#0 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%233 : tensor<128x128x3x3xf16>) : tensor<128x128x3x3xf16> - %235 = call @Unknown106(%70#1, %232) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> - %236 = tensor.empty() : tensor<4x128x28x28xf16> - %237 = tensor.empty() : tensor<128xf32> - %238 = tensor.empty() : tensor<128xf32> - %239:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%67, %arg43, %235 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%236, %237, %238 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - %240 = tensor.empty() : tensor<4x128x28x28xf16> - %241 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%239#0, %13 : tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%240 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %242 = tensor.empty() : tensor<128x128x3x3xf16> - %243 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%65#0, %239#0 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%242 : tensor<128x128x3x3xf16>) : tensor<128x128x3x3xf16> - %244 = call @Unknown110(%226, %241, %65#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> - %245 = tensor.empty() : tensor<4x128x28x28xf16> - %246 = tensor.empty() : tensor<128xf32> - %247 = tensor.empty() : tensor<128xf32> - %248:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%62, %arg33, %244 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%245, %246, %247 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - %249 = tensor.empty() : tensor<4x128x28x28xf16> - %250 = byre.compute_on_tensor 
@ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%248#0, %12 : tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%249 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %251 = tensor.empty() : tensor<128x128x3x3xf16> - %252 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%60#0, %248#0 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%251 : tensor<128x128x3x3xf16>) : tensor<128x128x3x3xf16> - %253 = call @Unknown114(%60#1, %250) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> - %254 = tensor.empty() : tensor<4x128x28x28xf16> - %255 = tensor.empty() : tensor<128xf32> - %256 = tensor.empty() : tensor<128xf32> - %257:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%57, %arg28, %253 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%254, %255, %256 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - %258 = tensor.empty() : tensor<4x64x56x56xf16> - %259 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%257#0, %11 : tensor<4x128x28x28xf16>, tensor<128x64x3x3xf16>) outs(%258 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %260 = tensor.empty() : tensor<128x64x3x3xf16> - %261 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%51#0, %257#0 : tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) outs(%260 : tensor<128x64x3x3xf16>) : tensor<128x64x3x3xf16> - %262 = tensor.empty() : tensor<4x128x28x28xf16> - %263 = tensor.empty() : tensor<128xf32> - %264 = tensor.empty() : tensor<128xf32> - %265:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%53, %arg38, %244 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%262, %263, %264 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - %266 = tensor.empty() : tensor<4x64x56x56xf16> - %267 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%265#0, %10 : tensor<4x128x28x28xf16>, tensor<128x64x1x1xf16>) outs(%266 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %268 = tensor.empty() : tensor<128x64x1x1xf16> - %269 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = 
dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%51#0, %265#0 : tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) outs(%268 : tensor<128x64x1x1xf16>) : tensor<128x64x1x1xf16> - %270 = call @Unknown121(%267, %259, %51#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> - %271 = tensor.empty() : tensor<4x64x56x56xf16> - %272 = tensor.empty() : tensor<64xf32> - %273 = tensor.empty() : tensor<64xf32> - %274:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%48, %arg23, %270 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) outs(%271, %272, %273 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - %275 = tensor.empty() : tensor<4x64x56x56xf16> - %276 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%274#0, %9 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%275 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %277 = tensor.empty() : tensor<64x64x3x3xf16> - %278 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%46#0, %274#0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%277 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> - %279 = call @Unknown125(%46#1, %276) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %280 = tensor.empty() : tensor<4x64x56x56xf16> - %281 = tensor.empty() : tensor<64xf32> - %282 = tensor.empty() : tensor<64xf32> - %283:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%43, %arg18, %279 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) outs(%280, %281, %282 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - %284 = tensor.empty() : tensor<4x64x56x56xf16> - %285 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%283#0, %8 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%284 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %286 = tensor.empty() : tensor<64x64x3x3xf16> - %287 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%41#0, %283#0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%286 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> - %288 = call @Unknown129(%270, %285, %41#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> - %289 = tensor.empty() : tensor<4x64x56x56xf16> - %290 = tensor.empty() : tensor<64xf32> - %291 = tensor.empty() : tensor<64xf32> - %292:3 = byre.compute_on_tensor 
@BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%38, %arg13, %288 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) outs(%289, %290, %291 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - %293 = tensor.empty() : tensor<4x64x56x56xf16> - %294 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%292#0, %7 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%293 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %295 = tensor.empty() : tensor<64x64x3x3xf16> - %296 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%36#0, %292#0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%295 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> - %297 = call @Unknown133(%36#1, %294) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %298 = tensor.empty() : tensor<4x64x56x56xf16> - %299 = tensor.empty() : tensor<64xf32> - %300 = tensor.empty() : tensor<64xf32> - %301:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%33, %arg8, %297 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) outs(%298, %299, %300 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - %302 = tensor.empty() : tensor<4x64x56x56xf16> - %303 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%301#0, %6 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%302 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %304 = tensor.empty() : tensor<64x64x3x3xf16> - %305 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%31, %301#0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%304 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> - %306 = call @Unknown137(%288, %303) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %307 = tensor.empty() : tensor<4x64x112x112xf16> - %308 = byre.compute_on_tensor @PoolMaxGradOp_f16f16_f16 {padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} ins(%29#0, %306 : tensor<4x64x112x112xf16>, tensor<4x64x56x56xf16>) outs(%307 : tensor<4x64x112x112xf16>) : tensor<4x64x112x112xf16> - %309 = call @Unknown138(%29#1, %308) : (tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> - %310 = tensor.empty() : tensor<4x64x112x112xf16> - %311 = tensor.empty() : tensor<64xf32> - %312 = tensor.empty() : tensor<64xf32> - %313:3 = byre.compute_on_tensor 
@BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%3, %arg3, %309 : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<4x64x112x112xf16>) outs(%310, %311, %312 : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32> - %314 = tensor.empty() : tensor<64x3x7x7xf16> - %315 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%0, %313#0 : tensor<4x3x224x224xf16>, tensor<4x64x112x112xf16>) outs(%314 : tensor<64x3x7x7xf16>) : tensor<64x3x7x7xf16> - %316 = tensor.empty() : tensor - %317 = byre.compute_on_tensor @ReduceSumOp_f32_f32 {dimensions = dense<[0, 1]> : tensor<2xi64>} ins(%135#1 : tensor<4x1000xf32>) outs(%316 : tensor) : tensor - %318 = call @Unknown141(%317) : (tensor) -> tensor - %319 = call @Unknown142(%315) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> - %320 = call @Unknown143(%305) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %321 = call @Unknown144(%296) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %322 = call @Unknown145(%287) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %323 = call @Unknown146(%278) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %324 = call @Unknown147(%261) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> - %325 = call @Unknown148(%252) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %326 = call @Unknown149(%269) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> - %327 = call @Unknown150(%243) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %328 = call @Unknown151(%234) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %329 = call @Unknown152(%217) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> - %330 = call @Unknown153(%208) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %331 = call @Unknown154(%225) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> - %332 = call @Unknown155(%199) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %333 = call @Unknown156(%190) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %334 = call @Unknown157(%173) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> - %335 = call @Unknown158(%164) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %336 = call @Unknown159(%181) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> - %337 = call @Unknown160(%155) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %338 = call @Unknown161(%146) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %339 = tensor.empty() : tensor<1000x512xf16> - %340 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 0 : i64, output_transpose, rhs_contracting_dimension = 0 : i64} ins(%126, %135#0 : tensor<4x512xf16>, tensor<4x1000xf16>) outs(%339 : tensor<1000x512xf16>) : tensor<1000x512xf16> - %341 = call @Unknown163(%340) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> - %342 = tensor.empty() : tensor<1000xf32> - %343 = byre.compute_on_tensor @ReduceSumOp_f32_f32 {dimensions = dense<0> : tensor<1xi64>} ins(%135#2 : tensor<4x1000xf32>) outs(%342 : tensor<1000xf32>) : tensor<1000xf32> - %344 = call @Unknown164(%343) : (tensor<1000xf32>) -> tensor<1000xf32> - return %318, %319, %313#1, %313#2, %320, %301#1, %301#2, %321, %292#1, %292#2, %322, %283#1, %283#2, %323, %274#1, %274#2, %324, %257#1, 
%257#2, %325, %248#1, %248#2, %326, %265#1, %265#2, %327, %239#1, %239#2, %328, %230#1, %230#2, %329, %213#1, %213#2, %330, %204#1, %204#2, %331, %221#1, %221#2, %332, %195#1, %195#2, %333, %186#1, %186#2, %334, %169#1, %169#2, %335, %160#1, %160#2, %336, %177#1, %177#2, %337, %151#1, %151#2, %338, %142#1, %142#2, %341, %344 : tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32> + %123:2 = call @Unknown57(%122, %113#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %124 = call @Unknown62(%123#0) : (tensor<4x512x7x7xf16>) -> tensor<4x512xf16> + %125 = call @Unknown63(%124) : (tensor<4x512xf16>) -> tensor<4x512xf16> + %126 = tensor.empty() : tensor<4x1000xf16> + %127 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 1 : i64} ins(%125, %26 : tensor<4x512xf16>, tensor<1000x512xf16>) outs(%126 : tensor<4x1000xf16>) : tensor<4x1000xf16> + %128 = call @Unknown64(%27, %127) : (tensor<1000xf16>, tensor<4x1000xf16>) -> tensor<4x1000xf16> + %129 = call @Unknown65(%128) : (tensor<4x1000xf16>) -> tensor<4xf16> + %130 = call @Unknown66(%129, %128) : (tensor<4xf16>, tensor<4x1000xf16>) -> tensor<4x1000xf16> + %131 = call @Unknown67(%130) : (tensor<4x1000xf16>) -> tensor<4xf16> + %132 = call @Unknown68(%131) : (tensor<4xf16>) -> tensor<4xf16> + %133:2 = call @Unknown69(%132, %130, %28, %25) : (tensor<4xf16>, tensor<4x1000xf16>, tensor<4xf16>, tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) + %134 = tensor.empty() : tensor<4x512xf16> + %135 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 0 : i64} ins(%133#1, %26 : tensor<4x1000xf16>, tensor<1000x512xf16>) outs(%134 : tensor<4x512xf16>) : tensor<4x512xf16> + %136 = call @Unknown70(%135, %123#1) : (tensor<4x512xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> + %137 = tensor.empty() : tensor<4x512x7x7xf16> + %138 = tensor.empty() : tensor<512xf32> + %139 = tensor.empty() : tensor<512xf32> + %140:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%120, %arg98, %136 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%137, %138, %139 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, 
tensor<512xf32> + %141 = tensor.empty() : tensor<4x512x7x7xf16> + %142 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%140#0, %24 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%141 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> + %143 = tensor.empty() : tensor<512x512x3x3xf16> + %144 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%118#0, %140#0 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%143 : tensor<512x512x3x3xf16>) : tensor<512x512x3x3xf16> + %145 = call @Unknown74(%118#1, %142) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> + %146 = tensor.empty() : tensor<4x512x7x7xf16> + %147 = tensor.empty() : tensor<512xf32> + %148 = tensor.empty() : tensor<512xf32> + %149:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%115, %arg93, %145 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%146, %147, %148 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> + %150 = tensor.empty() : tensor<4x512x7x7xf16> + %151 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%149#0, %23 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%150 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> + %152 = tensor.empty() : tensor<512x512x3x3xf16> + %153 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%113#0, %149#0 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%152 : tensor<512x512x3x3xf16>) : tensor<512x512x3x3xf16> + %154 = call @Unknown78(%136, %151, %113#1) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> + %155 = tensor.empty() : tensor<4x512x7x7xf16> + %156 = tensor.empty() : tensor<512xf32> + %157 = tensor.empty() : tensor<512xf32> + %158:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%110, %arg83, %154 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%155, %156, %157 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> + %159 = tensor.empty() : tensor<4x512x7x7xf16> + %160 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%158#0, %22 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%159 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> + %161 = tensor.empty() : tensor<512x512x3x3xf16> + %162 = 
byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%108#0, %158#0 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%161 : tensor<512x512x3x3xf16>) : tensor<512x512x3x3xf16> + %163 = call @Unknown74(%108#1, %160) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> + %164 = tensor.empty() : tensor<4x512x7x7xf16> + %165 = tensor.empty() : tensor<512xf32> + %166 = tensor.empty() : tensor<512xf32> + %167:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%105, %arg78, %163 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%164, %165, %166 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> + %168 = tensor.empty() : tensor<4x256x14x14xf16> + %169 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%167#0, %21 : tensor<4x512x7x7xf16>, tensor<512x256x3x3xf16>) outs(%168 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> + %170 = tensor.empty() : tensor<512x256x3x3xf16> + %171 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%99#0, %167#0 : tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) outs(%170 : tensor<512x256x3x3xf16>) : tensor<512x256x3x3xf16> + %172 = tensor.empty() : tensor<4x512x7x7xf16> + %173 = tensor.empty() : tensor<512xf32> + %174 = tensor.empty() : tensor<512xf32> + %175:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%101, %arg88, %154 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%172, %173, %174 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> + %176 = tensor.empty() : tensor<4x256x14x14xf16> + %177 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%175#0, %20 : tensor<4x512x7x7xf16>, tensor<512x256x1x1xf16>) outs(%176 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> + %178 = tensor.empty() : tensor<512x256x1x1xf16> + %179 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%99#0, %175#0 : tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) outs(%178 : tensor<512x256x1x1xf16>) : tensor<512x256x1x1xf16> + %180 = call @Unknown89(%177, %169, %99#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> + %181 = tensor.empty() : tensor<4x256x14x14xf16> + %182 = tensor.empty() : tensor<256xf32> + %183 = 
tensor.empty() : tensor<256xf32> + %184:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%96, %arg73, %180 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%181, %182, %183 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> + %185 = tensor.empty() : tensor<4x256x14x14xf16> + %186 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%184#0, %19 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%185 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> + %187 = tensor.empty() : tensor<256x256x3x3xf16> + %188 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%94#0, %184#0 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%187 : tensor<256x256x3x3xf16>) : tensor<256x256x3x3xf16> + %189 = call @Unknown93(%94#1, %186) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> + %190 = tensor.empty() : tensor<4x256x14x14xf16> + %191 = tensor.empty() : tensor<256xf32> + %192 = tensor.empty() : tensor<256xf32> + %193:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%91, %arg68, %189 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%190, %191, %192 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> + %194 = tensor.empty() : tensor<4x256x14x14xf16> + %195 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%193#0, %18 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%194 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> + %196 = tensor.empty() : tensor<256x256x3x3xf16> + %197 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%89#0, %193#0 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%196 : tensor<256x256x3x3xf16>) : tensor<256x256x3x3xf16> + %198 = call @Unknown89(%180, %195, %89#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> + %199 = tensor.empty() : tensor<4x256x14x14xf16> + %200 = tensor.empty() : tensor<256xf32> + %201 = tensor.empty() : tensor<256xf32> + %202:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%86, %arg58, %198 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%199, %200, %201 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> + %203 = tensor.empty() : tensor<4x256x14x14xf16> + %204 = 
byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%202#0, %17 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%203 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> + %205 = tensor.empty() : tensor<256x256x3x3xf16> + %206 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%84#0, %202#0 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%205 : tensor<256x256x3x3xf16>) : tensor<256x256x3x3xf16> + %207 = call @Unknown93(%84#1, %204) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> + %208 = tensor.empty() : tensor<4x256x14x14xf16> + %209 = tensor.empty() : tensor<256xf32> + %210 = tensor.empty() : tensor<256xf32> + %211:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%81, %arg53, %207 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%208, %209, %210 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> + %212 = tensor.empty() : tensor<4x128x28x28xf16> + %213 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%211#0, %16 : tensor<4x256x14x14xf16>, tensor<256x128x3x3xf16>) outs(%212 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> + %214 = tensor.empty() : tensor<256x128x3x3xf16> + %215 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%75#0, %211#0 : tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) outs(%214 : tensor<256x128x3x3xf16>) : tensor<256x128x3x3xf16> + %216 = tensor.empty() : tensor<4x256x14x14xf16> + %217 = tensor.empty() : tensor<256xf32> + %218 = tensor.empty() : tensor<256xf32> + %219:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%77, %arg63, %198 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%216, %217, %218 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> + %220 = tensor.empty() : tensor<4x128x28x28xf16> + %221 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%219#0, %15 : tensor<4x256x14x14xf16>, tensor<256x128x1x1xf16>) outs(%220 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> + %222 = tensor.empty() : tensor<256x128x1x1xf16> + %223 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = 
"NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%75#0, %219#0 : tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) outs(%222 : tensor<256x128x1x1xf16>) : tensor<256x128x1x1xf16> + %224 = call @Unknown108(%221, %213, %75#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> + %225 = tensor.empty() : tensor<4x128x28x28xf16> + %226 = tensor.empty() : tensor<128xf32> + %227 = tensor.empty() : tensor<128xf32> + %228:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%72, %arg48, %224 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%225, %226, %227 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> + %229 = tensor.empty() : tensor<4x128x28x28xf16> + %230 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%228#0, %14 : tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%229 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> + %231 = tensor.empty() : tensor<128x128x3x3xf16> + %232 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%70#0, %228#0 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%231 : tensor<128x128x3x3xf16>) : tensor<128x128x3x3xf16> + %233 = call @Unknown112(%70#1, %230) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> + %234 = tensor.empty() : tensor<4x128x28x28xf16> + %235 = tensor.empty() : tensor<128xf32> + %236 = tensor.empty() : tensor<128xf32> + %237:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%67, %arg43, %233 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%234, %235, %236 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> + %238 = tensor.empty() : tensor<4x128x28x28xf16> + %239 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%237#0, %13 : tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%238 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> + %240 = tensor.empty() : tensor<128x128x3x3xf16> + %241 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%65#0, %237#0 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%240 : tensor<128x128x3x3xf16>) : tensor<128x128x3x3xf16> + %242 = call @Unknown108(%224, %239, %65#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> + %243 = tensor.empty() : tensor<4x128x28x28xf16> + %244 = tensor.empty() 
: tensor<128xf32> + %245 = tensor.empty() : tensor<128xf32> + %246:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%62, %arg33, %242 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%243, %244, %245 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> + %247 = tensor.empty() : tensor<4x128x28x28xf16> + %248 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%246#0, %12 : tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%247 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> + %249 = tensor.empty() : tensor<128x128x3x3xf16> + %250 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%60#0, %246#0 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%249 : tensor<128x128x3x3xf16>) : tensor<128x128x3x3xf16> + %251 = call @Unknown112(%60#1, %248) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> + %252 = tensor.empty() : tensor<4x128x28x28xf16> + %253 = tensor.empty() : tensor<128xf32> + %254 = tensor.empty() : tensor<128xf32> + %255:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%57, %arg28, %251 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%252, %253, %254 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> + %256 = tensor.empty() : tensor<4x64x56x56xf16> + %257 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%255#0, %11 : tensor<4x128x28x28xf16>, tensor<128x64x3x3xf16>) outs(%256 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> + %258 = tensor.empty() : tensor<128x64x3x3xf16> + %259 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%51#0, %255#0 : tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) outs(%258 : tensor<128x64x3x3xf16>) : tensor<128x64x3x3xf16> + %260 = tensor.empty() : tensor<4x128x28x28xf16> + %261 = tensor.empty() : tensor<128xf32> + %262 = tensor.empty() : tensor<128xf32> + %263:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%53, %arg38, %242 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%260, %261, %262 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> + %264 = tensor.empty() : tensor<4x64x56x56xf16> + %265 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%263#0, %10 : tensor<4x128x28x28xf16>, tensor<128x64x1x1xf16>) outs(%264 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> + %266 = tensor.empty() : tensor<128x64x1x1xf16> + %267 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%51#0, %263#0 : tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) outs(%266 : tensor<128x64x1x1xf16>) : tensor<128x64x1x1xf16> + %268 = call @Unknown127(%265, %257, %51#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> + %269 = tensor.empty() : tensor<4x64x56x56xf16> + %270 = tensor.empty() : tensor<64xf32> + %271 = tensor.empty() : tensor<64xf32> + %272:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%48, %arg23, %268 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) outs(%269, %270, %271 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> + %273 = tensor.empty() : tensor<4x64x56x56xf16> + %274 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%272#0, %9 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%273 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> + %275 = tensor.empty() : tensor<64x64x3x3xf16> + %276 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%46#0, %272#0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%275 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> + %277 = call @Unknown131(%46#1, %274) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %278 = tensor.empty() : tensor<4x64x56x56xf16> + %279 = tensor.empty() : tensor<64xf32> + %280 = tensor.empty() : tensor<64xf32> + %281:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%43, %arg18, %277 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) outs(%278, %279, %280 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> + %282 = tensor.empty() : tensor<4x64x56x56xf16> + %283 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%281#0, %8 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%282 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> + %284 = tensor.empty() : tensor<64x64x3x3xf16> + %285 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : 
tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%41#0, %281#0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%284 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> + %286 = call @Unknown127(%268, %283, %41#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> + %287 = tensor.empty() : tensor<4x64x56x56xf16> + %288 = tensor.empty() : tensor<64xf32> + %289 = tensor.empty() : tensor<64xf32> + %290:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%38, %arg13, %286 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) outs(%287, %288, %289 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> + %291 = tensor.empty() : tensor<4x64x56x56xf16> + %292 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%290#0, %7 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%291 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> + %293 = tensor.empty() : tensor<64x64x3x3xf16> + %294 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%36#0, %290#0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%293 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> + %295 = call @Unknown131(%36#1, %292) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %296 = tensor.empty() : tensor<4x64x56x56xf16> + %297 = tensor.empty() : tensor<64xf32> + %298 = tensor.empty() : tensor<64xf32> + %299:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%33, %arg8, %295 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) outs(%296, %297, %298 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> + %300 = tensor.empty() : tensor<4x64x56x56xf16> + %301 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%299#0, %6 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%300 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> + %302 = tensor.empty() : tensor<64x64x3x3xf16> + %303 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%31, %299#0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%302 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> + %304 = call @Unknown143(%286, %301) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %305 = tensor.empty() : tensor<4x64x112x112xf16> + %306 = byre.compute_on_tensor @PoolMaxGradOp_f16f16_f16 {padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : 
tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} ins(%29#0, %304 : tensor<4x64x112x112xf16>, tensor<4x64x56x56xf16>) outs(%305 : tensor<4x64x112x112xf16>) : tensor<4x64x112x112xf16> + %307 = call @Unknown144(%29#1, %306) : (tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> + %308 = tensor.empty() : tensor<4x64x112x112xf16> + %309 = tensor.empty() : tensor<64xf32> + %310 = tensor.empty() : tensor<64xf32> + %311:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%3, %arg3, %307 : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<4x64x112x112xf16>) outs(%308, %309, %310 : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32> + %312 = tensor.empty() : tensor<64x3x7x7xf16> + %313 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%0, %311#0 : tensor<4x3x224x224xf16>, tensor<4x64x112x112xf16>) outs(%312 : tensor<64x3x7x7xf16>) : tensor<64x3x7x7xf16> + %314 = call @Unknown147(%133#0, %arg1) : (tensor<4x1000xf16>, tensor<4x1000xf32>) -> tensor + %315 = call @Unknown148(%314) : (tensor) -> tensor + %316 = call @Unknown149(%313) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> + %317 = call @Unknown150(%303) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %318 = call @Unknown150(%294) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %319 = call @Unknown150(%285) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %320 = call @Unknown150(%276) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %321 = call @Unknown154(%259) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> + %322 = call @Unknown155(%250) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %323 = call @Unknown156(%267) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> + %324 = call @Unknown155(%241) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %325 = call @Unknown155(%232) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %326 = call @Unknown159(%215) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> + %327 = call @Unknown160(%206) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %328 = call @Unknown161(%223) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> + %329 = call @Unknown160(%197) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %330 = call @Unknown160(%188) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %331 = call @Unknown164(%171) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> + %332 = call @Unknown165(%162) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %333 = call @Unknown166(%179) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> + %334 = call @Unknown165(%153) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %335 = call @Unknown165(%144) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %336 = tensor.empty() : tensor<1000x512xf16> + %337 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 0 : i64, output_transpose, rhs_contracting_dimension = 0 : i64} ins(%125, %133#1 : tensor<4x512xf16>, tensor<4x1000xf16>) outs(%336 : tensor<1000x512xf16>) : tensor<1000x512xf16> + %338 = call @Unknown170(%337) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> + %339 = call @Unknown171(%133#1) : 
(tensor<4x1000xf16>) -> tensor<1000xf32> + %340 = call @Unknown172(%339) : (tensor<1000xf32>) -> tensor<1000xf32> + return %315, %316, %311#1, %311#2, %317, %299#1, %299#2, %318, %290#1, %290#2, %319, %281#1, %281#2, %320, %272#1, %272#2, %321, %255#1, %255#2, %322, %246#1, %246#2, %323, %263#1, %263#2, %324, %237#1, %237#2, %325, %228#1, %228#2, %326, %211#1, %211#2, %327, %202#1, %202#2, %328, %219#1, %219#2, %329, %193#1, %193#2, %330, %184#1, %184#2, %331, %167#1, %167#2, %332, %158#1, %158#2, %333, %175#1, %175#2, %334, %149#1, %149#2, %335, %140#1, %140#2, %338, %340 : tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/5_affine_opt.mlir b/compiler/test/E2E/ResNet18/Whole/5_affine_opt.mlir index 0a797cefb..9f4640116 100644 --- a/compiler/test/E2E/ResNet18/Whole/5_affine_opt.mlir +++ b/compiler/test/E2E/ResNet18/Whole/5_affine_opt.mlir @@ -2,676 +2,1631 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1) -> (d0, d1)> -#map2 = affine_map<(d0, d1) -> (d1)> -#map3 = affine_map<(d0, d1) -> (d0)> -#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> -#map5 = affine_map<() -> ()> -#map6 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> +#map1 = affine_map<(d0) -> (d0 * 2 - (d0 floordiv 512) * 1024, 1000)> +#map2 = affine_map<(d0) -> (d0 * 2 - (d0 floordiv 512) * 1024 + 2, 1000)> +#map3 = affine_map<(d0, d1) -> (d0 - d1)> +#map4 = affine_map<(d0) -> (d0 * 2)> +#map5 = affine_map<(d0) -> (d0 * 2 + 1)> +#map6 = affine_map<(d0) -> (d0 mod 64, 49)> +#map7 = affine_map<(d0) -> (d0 mod 64 + 1, 49)> +#map8 = affine_map<(d0) -> (d0 mod 128, 125)> +#map9 = affine_map<(d0) -> (d0 mod 128 + 1, 125)> +#map10 = affine_map<(d0)[s0] -> (d0 * 32 + s0)> +#map11 = affine_map<(d0) -> (d0 * -32 + 1000, 32)> +#map12 = affine_map<(d0) -> (d0 * 32)> +#map13 = affine_map<(d0, d1) -> (d1 * -32 + 1000, 32, d0)> +#map14 = affine_map<(d0, d1) -> (d1 * -32 + 1000, 32, d0 + 1)> module @IrToMhlo.2452 { func.func private @Unknown0(%arg0: memref<4x3x224x224xf32>) -> memref<4x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c224 = arith.constant 224 : index %alloc = memref.alloc() : memref<4x3x224x224xf16> 
- linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x3x224x224xf32>) outs(%alloc : memref<4x3x224x224xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c224 step %c1 { + scf.for %arg4 = %c0 to %c224 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x3x224x224xf32> to memref<f32, strided<[], offset: ?>> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x3x224x224xf16> to memref<f16, strided<[], offset: ?>> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref<f32, strided<[], offset: ?>>) outs(%subview_0 : memref<f16, strided<[], offset: ?>>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x3x224x224xf16> } func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<64x3x7x7xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x3x7x7xf32>) outs(%alloc : memref<64x3x7x7xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf32> to memref<f32, strided<[], offset: ?>> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf16> to memref<f16, strided<[], offset: ?>> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref<f32, strided<[], offset: ?>>) outs(%subview_0 : memref<f16, strided<[], offset: ?>>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<64x3x7x7xf16> } func.func private @Unknown3(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown5(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc =
memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf32> to memref<f32, strided<[], offset: ?>> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf16> to memref<f16, strided<[], offset: ?>> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref<f32, strided<[], offset: ?>>) outs(%subview_0 : memref<f16, strided<[], offset: ?>>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<64x64x3x3xf16> } func.func private @Unknown7(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x1x1xf32>) outs(%alloc : memref<128x64x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf32> to memref<f32, strided<[], offset: ?>> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf16> to memref<f16, strided<[], offset: ?>> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref<f32, strided<[], offset: ?>>) outs(%subview_0 : memref<f16, strided<[], offset: ?>>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<128x64x1x1xf16> } func.func private @Unknown8(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x3x3xf32>) outs(%alloc : memref<128x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1]
[1, 1, 1, 1] : memref<128x64x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<128x64x3x3xf16> } func.func private @Unknown9(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown10(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown11(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<128x128x3x3xf16> } func.func private @Unknown12(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x1x1xf32>) outs(%alloc : memref<256x128x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : 
memref<256x128x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<256x128x1x1xf16> } func.func private @Unknown13(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x3x3xf32>) outs(%alloc : memref<256x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<256x128x3x3xf16> } func.func private @Unknown14(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown15(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown16(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + 
%subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<256x256x3x3xf16> } func.func private @Unknown17(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x1x1xf32>) outs(%alloc : memref<512x256x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<512x256x1x1xf16> } func.func private @Unknown18(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x3x3xf32>) outs(%alloc : memref<512x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<512x256x3x3xf16> } func.func private @Unknown19(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { 
- ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown20(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown21(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<512x512x3x3xf16> } func.func private @Unknown22(%arg0: memref<4x1000xf32>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant -2.500000e-01 : f32 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<4x1000xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<4x1000xf32>) outs(%alloc : memref<4x1000xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.mulf %in, %cst : f32 - %1 = arith.truncf %0 : f32 to f16 - linalg.yield %1 : f16 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<4x1000xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.mulf %in, %cst : f32 + %1 = arith.truncf %0 : f32 to f16 + linalg.yield %1 : f16 + } + } } return %alloc : memref<4x1000xf16> } func.func private @Unknown23(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1000x512xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1000x512xf32>) outs(%alloc : memref<1000x512xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : 
f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c1000 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<1000x512xf16> } - func.func private @Unknown24(%arg0: memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown24(%arg0: memref<1000xf32>) -> memref<1000xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %alloc = memref.alloc() : memref<1000xf16> + scf.for %arg1 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg1] [1] [1] : memref<1000xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1] [1] [1] : memref<1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + return %alloc : memref<1000xf16> + } + func.func private @Unknown25(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = affine.min #map1(%arg2) + %1 = affine.min #map2(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_8 = memref.subview %expand_shape[0, %0] [1, %2] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %9 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %9 : f16 + } else { + scf.yield %cst : f16 + } + %5 = arith.addf %4, %cst : f16 + %6 = arith.cmpi ugt, %2, %c1 : index + %7 = scf.if %6 -> (f16) { + %9 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %9 : f16 + } else { + scf.yield %cst : f16 + } + %8 = arith.addf %5, %7 : f16 + memref.store %8, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca[%3] : memref<512xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + } {mapping = 
[#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_0[%3] : memref<256xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_1[%3] : memref<128xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_2[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_3[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_4[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_5[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_6[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_7[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_7[%3] : memref<2xf16, 
#gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private @Unknown26(%arg0: memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %alloc = memref.alloc() : memref<4x64x112x112xf16> %alloc_0 = memref.alloc() : memref<4x64x112x112xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x64x112x112xf16>) outs(%alloc, %alloc_0 : memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c112 step %c1 { + scf.for %arg4 = %c0 to %c112 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x64x112x112xf16>, memref<4x64x112x112xi1> } - func.func private @Unknown26(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x64x56x56xf16> - %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x64x56x56xf16>) outs(%alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> - } - func.func private @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown28(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} 
ins(%arg0, %arg1 : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> } - func.func private @Unknown30(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x64x56x56xf16>) outs(%alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xi1> to memref> + %subview_2 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_3 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3 : memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_4: f16, %out: f16, %out_5: i1): + %0 = arith.addf %in, %in_4 : f16 + %1 = arith.maximumf %0, %cst : f16 + %2 = arith.cmpf ogt, %1, %cst : f16 + linalg.yield %1, %2 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> } - func.func private @Unknown32(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, 
memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x64x56x56xf16> - %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> - } - func.func private @Unknown35(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x128x28x28xf16> - %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x128x28x28xf16>) outs(%alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> - } - func.func private @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown37(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) outs(%alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c28 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : 
memref<4x128x28x28xf16>, memref<4x128x28x28xi1> } - func.func private @Unknown39(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x128x28x28xf16>) outs(%alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c128 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + scf.for %arg5 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xi1> to memref> + %subview_2 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_3 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3 : memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_4: f16, %out: f16, %out_5: i1): + %0 = arith.addf %in, %in_4 : f16 + %1 = arith.maximumf %0, %cst : f16 + %2 = arith.cmpf ogt, %1, %cst : f16 + linalg.yield %1, %2 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> } - func.func private @Unknown41(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x128x28x28xf16> - %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) outs(%alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> - } - func.func private @Unknown44(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown46(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index 
+ %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x256x14x14xf16>) outs(%alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c14 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> } - func.func private @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) outs(%alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c256 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + scf.for %arg5 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xi1> to memref> + %subview_2 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_3 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3 : memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, 
%in_4: f16, %out: f16, %out_5: i1): + %0 = arith.addf %in, %in_4 : f16 + %1 = arith.maximumf %0, %cst : f16 + %2 = arith.cmpf ogt, %1, %cst : f16 + linalg.yield %1, %2 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> } - func.func private @Unknown48(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x256x14x14xf16> - %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x256x14x14xf16>) outs(%alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> - } - func.func private @Unknown50(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x256x14x14xf16> - %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) outs(%alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> - } - func.func private @Unknown53(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown55(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x512x7x7xf16>) outs(%alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview 
: memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> } - func.func private @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) outs(%alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c512 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + %subview_2 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_3 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3 : memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_4: f16, %out: f16, %out_5: i1): + %0 = arith.addf %in, %in_4 : f16 + %1 = arith.maximumf %0, %cst : f16 + %2 = arith.cmpf ogt, %1, %cst : f16 + linalg.yield %1, %2 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> } - func.func private @Unknown57(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown62(%arg0: memref<4x512x7x7xf16>) -> memref<4x512xf16> attributes {__byteir_reduction_fusion__} { %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x512x7x7xf16> - %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x512x7x7xf16>) outs(%alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> - } - func.func private @Unknown59(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, 
memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x512x7x7xf16> - %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) outs(%alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> - } - func.func private @Unknown60(%arg0: memref<4x512xf16>) -> memref<4x512xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %collapse_shape = memref.collapse_shape %arg0 [[0, 1], [2, 3]] : memref<4x512x7x7xf16> into memref<2048x49xf16> + %alloc = memref.alloc() : memref<2048xf16> + scf.forall (%arg1) in (2048) { + %subview = memref.subview %collapse_shape[%arg1, 0] [1, 49] [1, 1] : memref<2048x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape_0 = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.min #map6(%arg2) + %1 = affine.min #map7(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_6 = memref.subview %expand_shape_0[0, %0] [1, %2] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_7 = memref.expand_shape %subview_6 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %6 = memref.load %expand_shape_7[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %6 : f16 + } else { + scf.yield %cst : f16 + } + %5 = arith.addf %4, %cst : f16 + memref.store %5, %alloca[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_1[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_2[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<8xf16, #gpu.address_space> + } 
{mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_3[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_4[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_5[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<2048xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + %expand_shape = memref.expand_shape %alloc [[0, 1]] : memref<2048xf16> into memref<4x512xf16> + return %expand_shape : memref<4x512xf16> + } + func.func private @Unknown63(%arg0: memref<4x512xf16>) -> memref<4x512xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 2.040100e-02 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<4x512xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<4x512xf16>) outs(%alloc : memref<4x512xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.mulf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<4x512xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<4x512xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.mulf %in, %cst : f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<4x512xf16> } - func.func private @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown64(%arg0: memref<1000xf16>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<4x1000xf16> - linalg.generic {indexing_maps = [#map1, #map2, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<4x1000xf16>, memref<1000xf32>) outs(%alloc : memref<4x1000xf16>) { - ^bb0(%in: f16, %in_0: f32, %out: f16): - %0 = arith.truncf %in_0 : f32 to f16 - %1 = arith.addf %in, %0 : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c1000 step %c1 { + %subview = memref.subview 
%arg0[%arg3] [1] [1] : memref<1000xf16> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in_2, %in : f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<4x1000xf16> } - func.func private @Unknown62(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown65(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = affine.min #map1(%arg2) + %1 = affine.min #map2(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_8 = memref.subview %expand_shape[0, %0] [1, %2] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %8 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %8 : f16 + } else { + scf.yield %cst : f16 + } + %5 = arith.cmpi ugt, %2, %c1 : index + %6 = scf.if %5 -> (f16) { + %8 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %8 : f16 + } else { + scf.yield %cst : f16 + } + %7 = arith.maximumf %4, %6 : f16 + memref.store %7, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_0[%2] : memref<256xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_1[%2] : memref<128xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, 
%alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_2[%2] : memref<64xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_3[%2] : memref<32xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_4[%2] : memref<16xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_5[%2] : memref<8xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_6[%2] : memref<4xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_7[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_7[%2] : memref<2xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private @Unknown66(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<4x1000xf16> - %alloc_0 = memref.alloc() : memref<4x1000xf16> - linalg.generic {indexing_maps = [#map1, #map3, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<4x1000xf16>, memref<4xf16>) outs(%alloc, %alloc_0 : memref<4x1000xf16>, memref<4x1000xf16>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: f16): - %0 = arith.subf %in, %in_1 : f16 - %1 = math.exp %0 : f16 - linalg.yield %0, %1 : f16, f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c1000 step %c1 { + %subview = 
memref.subview %arg0[%arg2] [1] [1] : memref<4xf16> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.subf %in_2, %in : f16 + linalg.yield %0 : f16 + } + } } - return %alloc, %alloc_0 : memref<4x1000xf16>, memref<4x1000xf16> + return %alloc : memref<4x1000xf16> } - func.func private @Unknown63(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf32>) -> (memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown67(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = affine.min #map1(%arg2) + %1 = affine.min #map2(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_8 = memref.subview %expand_shape[0, %0] [1, %2] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %11 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %11 : f16 + } else { + scf.yield %cst : f16 + } + %5 = math.exp %4 : f16 + %6 = arith.addf %5, %cst : f16 + %7 = arith.cmpi ugt, %2, %c1 : index + %8 = scf.if %7 -> (f16) { + %11 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %11 : f16 + } else { + scf.yield %cst : f16 + } + %9 = math.exp %8 : f16 + %10 = arith.addf %6, %9 : f16 + memref.store %10, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca[%3] : memref<512xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_0[%3] : memref<256xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : 
memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_1[%3] : memref<128xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_2[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_3[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_4[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_5[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_6[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_7[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_7[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private @Unknown68(%arg0: memref<4xf16>) -> memref<4xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %alloc = memref.alloc() : memref<4xf16> + scf.for %arg1 = %c0 to %c4 step %c1 { + %subview = memref.subview 
%arg0[%arg1] [1] [1] : memref<4xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1] [1] [1] : memref<4xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = math.log %in : f16 + linalg.yield %0 : f16 + } + } + return %alloc : memref<4xf16> + } + func.func private @Unknown69(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<4x1000xf16> - %alloc_0 = memref.alloc() : memref<4x1000xf32> - %alloc_1 = memref.alloc() : memref<4x1000xf32> - linalg.generic {indexing_maps = [#map1, #map1, #map3, #map3, #map1, #map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg3, %arg1, %arg0, %arg2, %arg4 : memref<4x1000xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4xf16>, memref<4x1000xf32>) outs(%alloc, %alloc_0, %alloc_1 : memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>) { - ^bb0(%in: f16, %in_2: f16, %in_3: f16, %in_4: f16, %in_5: f32, %out: f16, %out_6: f32, %out_7: f32): - %0 = math.log %in_3 : f16 - %1 = arith.subf %in_2, %0 : f16 - %2 = math.exp %1 : f16 - %3 = arith.mulf %2, %in_4 : f16 - %4 = arith.subf %in, %3 : f16 - %5 = arith.extf %1 : f16 to f32 - %6 = arith.mulf %5, %in_5 : f32 - %7 = arith.extf %4 : f16 to f32 - linalg.yield %4, %6, %7 : f16, f32, f32 + %alloc_0 = memref.alloc() : memref<4x1000xf16> + scf.for %arg4 = %c0 to %c4 step %c1 { + scf.for %arg5 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg2[%arg4] [1] [1] : memref<4xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg4, %arg5] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_2 = memref.subview %alloc[%arg4, %arg5] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_3 = memref.subview %arg0[%arg4] [1] [1] : memref<4xf16> to memref> + %subview_4 = memref.subview %arg1[%arg4, %arg5] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_5 = memref.subview %arg3[%arg4, %arg5] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3, %subview_4, %subview_5 : memref>, memref>, memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_6: f16, %in_7: f16, %in_8: f16, %out: f16, %out_9: f16): + %0 = arith.subf %in_7, %in_6 : f16 + %1 = math.exp %0 : f16 + %2 = arith.mulf %1, %in : f16 + %3 = arith.subf %in_8, %2 : f16 + linalg.yield %0, %3 : f16, f16 + } + } } - return %alloc, %alloc_0, %alloc_1 : memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32> + return %alloc, %alloc_0 : memref<4x1000xf16>, memref<4x1000xf16> } - func.func private @Unknown64(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown70(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 4.900000e+01 : f16 %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : 
memref<4x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map4, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1, %arg0 : memref<4x512x7x7xi1>, memref<4x512xf16>) outs(%alloc : memref<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_1: f16, %out: f16): - %0 = arith.divf %in_1, %cst : f16 - %1 = arith.select %in, %0, %cst_0 : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c512 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3] [1, 1] [1, 1] : memref<4x512xf16> to memref> + %subview_1 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_2 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref>, memref>) outs(%subview_1 : memref>) { + ^bb0(%in: f16, %in_3: i1, %out: f16): + %0 = arith.divf %in, %cst : f16 + %1 = arith.select %in_3, %0, %cst_0 : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown68(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown74(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) outs(%alloc : memref<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c512 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown72(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index 
+ %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x512x7x7xi1>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) outs(%alloc : memref<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c512 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + scf.for %arg6 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_2 = memref.subview %arg2[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: i1, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.select %in_4, %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown76(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) outs(%alloc : memref<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x512x7x7xf16> - } - func.func private @Unknown83(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x256x14x14xi1>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) outs(%alloc : memref<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<4x256x14x14xf16> - } - func.func private @Unknown87(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : 
memref<4x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) outs(%alloc : memref<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c256 step %c1 { + scf.for %arg5 = %c0 to %c14 step %c1 { + scf.for %arg6 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_2 = memref.subview %arg2[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: i1, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.select %in_4, %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x256x14x14xf16> } - func.func private @Unknown91(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown93(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x256x14x14xi1>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) outs(%alloc : memref<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c256 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + scf.for %arg5 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x256x14x14xf16> } - func.func private @Unknown95(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = 
memref.alloc() : memref<4x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) outs(%alloc : memref<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x256x14x14xf16> - } - func.func private @Unknown102(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown108(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x128x28x28xi1>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) outs(%alloc : memref<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c128 step %c1 { + scf.for %arg5 = %c0 to %c28 step %c1 { + scf.for %arg6 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_2 = memref.subview %arg2[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: i1, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.select %in_4, %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x128x28x28xf16> } - func.func private @Unknown106(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown112(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) outs(%alloc : memref<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for 
%arg3 = %c0 to %c128 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + scf.for %arg5 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x128x28x28xf16> } - func.func private @Unknown110(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x128x28x28xi1>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) outs(%alloc : memref<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<4x128x28x28xf16> - } - func.func private @Unknown114(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) outs(%alloc : memref<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x128x28x28xf16> - } - func.func private @Unknown121(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown127(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x64x56x56xi1>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c64 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + scf.for %arg6 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : 
memref<4x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_2 = memref.subview %arg2[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: i1, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.select %in_4, %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown125(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown131(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown129(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + func.func private @Unknown143(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x64x56x56xi1>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 
= arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown133(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x64x56x56xf16> - } - func.func private @Unknown137(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x64x56x56xf16> - } - func.func private @Unknown138(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown144(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %alloc = memref.alloc() : memref<4x64x112x112xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) outs(%alloc : memref<4x64x112x112xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c112 step %c1 { + scf.for %arg5 = %c0 to %c112 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, 
%arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x64x112x112xf16> } - func.func private @Unknown141(%arg0: memref) -> memref attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown147(%arg0: memref<4x1000xf16>, %arg1: memref<4x1000xf32>) -> memref attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref + %collapse_shape = memref.collapse_shape %arg0 [[0, 1]] : memref<4x1000xf16> into memref<4000xf16> + %collapse_shape_1 = memref.collapse_shape %arg1 [[0, 1]] : memref<4x1000xf32> into memref<4000xf32> + %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<4000xf16> into memref<32x125xf16> + %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] : memref<4000xf32> into memref<32x125xf32> + %alloc_3 = memref.alloc() : memref<32xf32> + scf.forall (%arg2) in (32) { + %subview = memref.subview %expand_shape[%arg2, 0] [1, 125] [1, 1] : memref<32x125xf16> to memref<125xf16, strided<[1], offset: ?>> + %expand_shape_4 = memref.expand_shape %subview [[0, 1]] : memref<125xf16, strided<[1], offset: ?>> into memref<1x125xf16, strided<[125, 1], offset: ?>> + %subview_5 = memref.subview %expand_shape_2[%arg2, 0] [1, 125] [1, 1] : memref<32x125xf32> to memref<125xf32, strided<[1], offset: ?>> + %expand_shape_6 = memref.expand_shape %subview_5 [[0, 1]] : memref<125xf32, strided<[1], offset: ?>> into memref<1x125xf32, strided<[125, 1], offset: ?>> + %alloca = memref.alloca() : memref<128xf32, #gpu.address_space> + scf.forall (%arg3) in (128) { + %0 = affine.min #map8(%arg3) + %1 = affine.min #map9(%arg3) + %2 = affine.apply #map3(%1, %0) + %subview_13 = memref.subview %expand_shape_4[0, %0] [1, %2] [1, 1] : memref<1x125xf16, strided<[125, 1], offset: ?>> to memref> + %expand_shape_14 = memref.expand_shape %subview_13 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %subview_15 = memref.subview %expand_shape_6[0, %0] [1, %2] [1, 1] : memref<1x125xf32, strided<[125, 1], offset: ?>> to memref> + %expand_shape_16 = memref.expand_shape %subview_15 [[0, 1]] : memref> into memref<1x?xf32, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4:2 = scf.if %3 -> (f16, f32) { + %8 = memref.load %expand_shape_14[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + %9 = memref.load %expand_shape_16[%c0, %c0] : memref<1x?xf32, strided<[?, 1], offset: ?>> + scf.yield %8, %9 : f16, f32 + } else { + scf.yield %cst_0, %cst : f16, f32 + } + %5 = arith.extf %4#0 : f16 to f32 + %6 = arith.mulf %5, %4#1 : f32 + %7 = arith.addf %6, %cst : f32 + memref.store %7, %alloca[%arg3] : memref<128xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<64xf32, #gpu.address_space> + scf.forall (%arg3) in (64) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca[%0] : memref<128xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca[%3] : memref<128xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_7[%arg3] : memref<64xf32, #gpu.address_space> + } 
{mapping = [#gpu.thread]} + %alloca_8 = memref.alloca() : memref<32xf32, #gpu.address_space> + scf.forall (%arg3) in (32) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_7[%0] : memref<64xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_7[%3] : memref<64xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_8[%arg3] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_9 = memref.alloca() : memref<16xf32, #gpu.address_space> + scf.forall (%arg3) in (16) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_8[%0] : memref<32xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_8[%3] : memref<32xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_9[%arg3] : memref<16xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_10 = memref.alloca() : memref<8xf32, #gpu.address_space> + scf.forall (%arg3) in (8) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_9[%0] : memref<16xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_9[%3] : memref<16xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_10[%arg3] : memref<8xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_11 = memref.alloca() : memref<4xf32, #gpu.address_space> + scf.forall (%arg3) in (4) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_10[%0] : memref<8xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_10[%3] : memref<8xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_11[%arg3] : memref<4xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_12 = memref.alloca() : memref<2xf32, #gpu.address_space> + scf.forall (%arg3) in (2) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_11[%0] : memref<4xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_11[%3] : memref<4xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_12[%arg3] : memref<2xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg3) in (1) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_12[%0] : memref<2xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_12[%3] : memref<2xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloc_3[%arg2] : memref<32xf32> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + scf.forall (%arg2) in (1) { + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + scf.forall (%arg3) in (32) { + %0 = affine.apply #map10(%arg2)[%arg3] + %1 = memref.load %alloc_3[%0] : memref<32xf32> + %2 = arith.addf %1, %cst : f32 + memref.store %2, %alloca[%arg3] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf32, #gpu.address_space> + scf.forall (%arg3) in (16) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca[%0] : memref<32xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca[%3] : memref<32xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_4[%arg3] : 
memref<16xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf32, #gpu.address_space> + scf.forall (%arg3) in (8) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_4[%0] : memref<16xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_4[%3] : memref<16xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_5[%arg3] : memref<8xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf32, #gpu.address_space> + scf.forall (%arg3) in (4) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_5[%0] : memref<8xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_5[%3] : memref<8xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_6[%arg3] : memref<4xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf32, #gpu.address_space> + scf.forall (%arg3) in (2) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_6[%0] : memref<4xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_6[%3] : memref<4xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_7[%arg3] : memref<2xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg3) in (1) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_7[%0] : memref<2xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_7[%3] : memref<2xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloc[] : memref + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref + } + func.func private @Unknown148(%arg0: memref) -> memref attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 4.000000e+00 : f32 %alloc = memref.alloc() : memref - linalg.generic {indexing_maps = [#map5, #map5], iterator_types = []} ins(%arg0 : memref) outs(%alloc : memref) { + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%arg0 : memref) outs(%alloc : memref) { ^bb0(%in: f32, %out: f32): %0 = arith.negf %in : f32 %1 = arith.divf %0, %cst : f32 @@ -679,202 +1634,335 @@ module @IrToMhlo.2452 { } return %alloc : memref } - func.func private @Unknown142(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown149(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<64x3x7x7xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x3x7x7xf16>) outs(%alloc : memref<64x3x7x7xf32>) attrs = {xla_shape = "f32[64,3,7,7]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf16> to memref> + %subview_0 = 
memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<64x3x7x7xf32> } - func.func private @Unknown143(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown144(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown145(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown146(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown150(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<64x64x3x3xf32> } - func.func private @Unknown147(%arg0: memref<128x64x3x3xf16>) 
-> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown154(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x3x3xf16>) outs(%alloc : memref<128x64x3x3xf32>) attrs = {xla_shape = "f32[128,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<128x64x3x3xf32> } - func.func private @Unknown148(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown155(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<128x128x3x3xf32> } - func.func private @Unknown149(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown156(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x1x1xf16>) outs(%alloc : 
memref<128x64x1x1xf32>) attrs = {xla_shape = "f32[128,64,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<128x64x1x1xf32> } - func.func private @Unknown150(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown151(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown152(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown159(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x3x3xf16>) outs(%alloc : memref<256x128x3x3xf32>) attrs = {xla_shape = "f32[256,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<256x128x3x3xf32> } - func.func private @Unknown153(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private 
@Unknown160(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<256x256x3x3xf32> } - func.func private @Unknown154(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown161(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x1x1xf16>) outs(%alloc : memref<256x128x1x1xf32>) attrs = {xla_shape = "f32[256,128,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<256x128x1x1xf32> } - func.func private @Unknown155(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown156(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", 
"parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown157(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown164(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x3x3xf16>) outs(%alloc : memref<512x256x3x3xf32>) attrs = {xla_shape = "f32[512,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<512x256x3x3xf32> } - func.func private @Unknown158(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown165(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<512x512x3x3xf32> } - func.func private @Unknown159(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown166(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = 
arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x1x1xf16>) outs(%alloc : memref<512x256x1x1xf32>) attrs = {xla_shape = "f32[512,256,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<512x256x1x1xf32> } - func.func private @Unknown160(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown161(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown163(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown170(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1000x512xf32> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1000x512xf16>) outs(%alloc : memref<1000x512xf32>) attrs = {xla_shape = "f32[1000,512]{0,1}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c1000 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<1000x512xf32> } - func.func private 
@Unknown164(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown171(%arg0: memref<4x1000xf16>) -> memref<1000xf32> attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<1000xf32> + scf.forall (%arg1) in (32) { + %0 = affine.min #map11(%arg1) + %1 = affine.apply #map12(%arg1) + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %alloca_1 = memref.alloca() : memref<2x32xf32, #gpu.address_space> + scf.forall (%arg2, %arg3) in (2, 32) { + %2 = affine.min #map13(%arg3, %arg1) + %3 = affine.min #map14(%arg3, %arg1) + %4 = affine.apply #map3(%3, %2) + %5 = arith.cmpi ugt, %4, %c0 : index + %6 = scf.if %5 -> (f16) { + %12 = affine.apply #map4(%arg2) + %13 = affine.apply #map10(%arg1)[%2] + %14 = memref.load %arg0[%12, %13] : memref<4x1000xf16> + scf.yield %14 : f16 + } else { + scf.yield %cst_0 : f16 + } + %7 = arith.extf %6 : f16 to f32 + %8 = arith.addf %7, %cst : f32 + %9 = scf.if %5 -> (f16) { + %12 = affine.apply #map5(%arg2) + %13 = affine.apply #map10(%arg1)[%2] + %14 = memref.load %arg0[%12, %13] : memref<4x1000xf16> + scf.yield %14 : f16 + } else { + scf.yield %cst_0 : f16 + } + %10 = arith.extf %9 : f16 to f32 + %11 = arith.addf %8, %10 : f32 + memref.store %11, %alloca_1[%arg2, %arg3] : memref<2x32xf32, #gpu.address_space> + } {mapping = [#gpu.thread, #gpu.thread]} + scf.forall (%arg2) in (32) { + %2 = memref.load %alloca_1[%c0, %arg2] : memref<2x32xf32, #gpu.address_space> + %3 = arith.addf %2, %cst : f32 + %4 = memref.load %alloca_1[%c1, %arg2] : memref<2x32xf32, #gpu.address_space> + %5 = arith.addf %4, %3 : f32 + memref.store %5, %alloca[%arg2] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %subview = memref.subview %alloca[0] [%0] [1] : memref<32xf32, #gpu.address_space> to memref, #gpu.address_space> + %subview_2 = memref.subview %alloc[%1] [%0] [1] : memref<1000xf32> to memref> + memref.copy %subview, %subview_2 : memref, #gpu.address_space> to memref> + } {mapping = [#gpu.block]} + return %alloc : memref<1000xf32> + } + func.func private @Unknown172(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<1000xf32> - linalg.generic {indexing_maps = [#map6, #map6], iterator_types = ["parallel"]} ins(%arg0 : memref<1000xf32>) outs(%alloc : memref<1000xf32>) { - ^bb0(%in: f32, %out: f32): - %0 = arith.truncf %in : f32 to f16 - %1 = arith.extf %0 : f16 to f32 - linalg.yield %1 : f32 + scf.for %arg1 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg1] [1] [1] : memref<1000xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1] [1] [1] : memref<1000xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f32): + %0 = arith.truncf %in : f32 to f16 + %1 = arith.extf %0 : f16 to f32 + linalg.yield %1 : f32 + } } return %alloc : memref<1000xf32> } @@ -886,344 +1974,340 @@ module @IrToMhlo.2452 { %alloc_0 = memref.alloc() : memref<4x64x112x112xf16> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc, %arg3, %arg4, %alloc_0) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : 
i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x112x112xf16> %2 = call @Unknown3(%arg7) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %3 = call @Unknown4(%arg12) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %4 = call @Unknown5(%arg17) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %5 = call @Unknown6(%arg22) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %3 = call @Unknown3(%arg12) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %4 = call @Unknown3(%arg17) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %5 = call @Unknown3(%arg22) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %6 = call @Unknown7(%arg37) : (memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> %7 = call @Unknown8(%arg27) : (memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> %8 = call @Unknown9(%arg32) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> - %9 = call @Unknown10(%arg42) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> - %10 = call @Unknown11(%arg47) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %9 = call @Unknown9(%arg42) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %10 = call @Unknown9(%arg47) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %11 = call @Unknown12(%arg62) : (memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> %12 = call @Unknown13(%arg52) : (memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> %13 = call @Unknown14(%arg57) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> - %14 = call @Unknown15(%arg67) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> - %15 = call @Unknown16(%arg72) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %14 = call @Unknown14(%arg67) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %15 = call @Unknown14(%arg72) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %16 = call @Unknown17(%arg87) : (memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> %17 = call @Unknown18(%arg77) : (memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> %18 = call @Unknown19(%arg82) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> - %19 = call @Unknown20(%arg92) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> - %20 = call @Unknown21(%arg97) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %19 = call @Unknown19(%arg92) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %20 = call @Unknown19(%arg97) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %21 = call @Unknown22(%arg1) : (memref<4x1000xf32>) -> memref<4x1000xf16> %22 = call @Unknown23(%arg102) : (memref<1000x512xf32>) -> memref<1000x512xf16> - %alloc_1 = memref.alloc() : memref<4xf16> - byre.compute @ReduceSumOp_f16_f16(%21, %alloc_1) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %23:2 = call @Unknown24(%alloc_0) : (memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) + %23 = call @Unknown24(%arg103) : (memref<1000xf32>) -> memref<1000xf16> + %24 = call @Unknown25(%21) : (memref<4x1000xf16>) -> memref<4xf16> + %25:2 = call @Unknown26(%alloc_0) : (memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) + %alloc_1 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @PoolMaxOp_f16_f16(%25#0, %alloc_1) {base_dilations = dense<1> : tensor<4xi64>, memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : 
tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16> %alloc_2 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @PoolMaxOp_f16_f16(%23#0, %alloc_2) {base_dilations = dense<1> : tensor<4xi64>, memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16> + byre.compute @ConvOp_f16f16_f16(%alloc_1, %2, %alloc_2) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_3 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%alloc_2, %2, %alloc_3) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_2, %arg8, %arg9, %alloc_3) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %26:2 = call @Unknown28(%alloc_3) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_4 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_3, %arg8, %arg9, %alloc_4) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %24:2 = call @Unknown26(%alloc_4) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%26#0, %3, %alloc_4) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_5 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%24#0, %3, %alloc_5) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_4, %arg13, %arg14, %alloc_5) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 
: i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %27:2 = call @Unknown30(%alloc_5, %alloc_1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_6 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_5, %arg13, %arg14, %alloc_6) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %25:2 = call @Unknown28(%alloc_6, %alloc_2) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%27#0, %4, %alloc_6) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_7 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%25#0, %4, %alloc_7) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_6, %arg18, %arg19, %alloc_7) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %28:2 = call @Unknown28(%alloc_7) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_8 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_7, %arg18, %arg19, %alloc_8) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %26:2 = call @Unknown30(%alloc_8) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%28#0, %5, %alloc_8) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_9 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%26#0, %5, %alloc_9) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - 
%alloc_10 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_9, %arg23, %arg24, %alloc_10) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %27:2 = call @Unknown32(%alloc_10, %25#0) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_8, %arg23, %arg24, %alloc_9) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %29:2 = call @Unknown30(%alloc_9, %27#0) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + %alloc_10 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvOp_f16f16_f16(%29#0, %6, %alloc_10) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x1x1xf16>, memref<4x128x28x28xf16> %alloc_11 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%27#0, %6, %alloc_11) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x1x1xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_10, %arg38, %arg39, %alloc_11) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> %alloc_12 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_11, %arg38, %arg39, %alloc_12) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + byre.compute @ConvOp_f16f16_f16(%29#0, %7, %alloc_12) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x3x3xf16>, memref<4x128x28x28xf16> %alloc_13 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%27#0, %7, %alloc_13) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x3x3xf16>, 
memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_12, %arg28, %arg29, %alloc_13) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %30:2 = call @Unknown37(%alloc_13) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_14 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_13, %arg28, %arg29, %alloc_14) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %28:2 = call @Unknown35(%alloc_14) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%30#0, %8, %alloc_14) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_15 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%28#0, %8, %alloc_15) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_14, %arg33, %arg34, %alloc_15) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %31:2 = call @Unknown39(%alloc_15, %alloc_11) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_16 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_15, %arg33, %arg34, %alloc_16) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %29:2 = call @Unknown37(%alloc_16, %alloc_12) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%31#0, %9, %alloc_16) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_17 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%29#0, %9, %alloc_17) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, 
memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_16, %arg43, %arg44, %alloc_17) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %32:2 = call @Unknown37(%alloc_17) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_18 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_17, %arg43, %arg44, %alloc_18) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %30:2 = call @Unknown39(%alloc_18) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%32#0, %10, %alloc_18) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_19 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%30#0, %10, %alloc_19) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_20 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_19, %arg48, %arg49, %alloc_20) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %31:2 = call @Unknown41(%alloc_20, %29#0) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_18, %arg48, %arg49, %alloc_19) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %33:2 = call @Unknown39(%alloc_19, %31#0) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + %alloc_20 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvOp_f16f16_f16(%33#0, %11, %alloc_20) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, 
memref<256x128x1x1xf16>, memref<4x256x14x14xf16> %alloc_21 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%31#0, %11, %alloc_21) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x1x1xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_20, %arg63, %arg64, %alloc_21) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> %alloc_22 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_21, %arg63, %arg64, %alloc_22) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + byre.compute @ConvOp_f16f16_f16(%33#0, %12, %alloc_22) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x3x3xf16>, memref<4x256x14x14xf16> %alloc_23 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%31#0, %12, %alloc_23) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_22, %arg53, %arg54, %alloc_23) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %34:2 = call @Unknown46(%alloc_23) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_24 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_23, %arg53, %arg54, %alloc_24) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %32:2 = call @Unknown44(%alloc_24) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%34#0, %13, %alloc_24) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_25 = 
memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%32#0, %13, %alloc_25) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_24, %arg58, %arg59, %alloc_25) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %35:2 = call @Unknown48(%alloc_25, %alloc_21) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_26 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_25, %arg58, %arg59, %alloc_26) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %33:2 = call @Unknown46(%alloc_26, %alloc_22) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%35#0, %14, %alloc_26) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_27 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%33#0, %14, %alloc_27) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_26, %arg68, %arg69, %alloc_27) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %36:2 = call @Unknown46(%alloc_27) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_28 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_27, %arg68, %arg69, %alloc_28) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %34:2 = call @Unknown48(%alloc_28) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%36#0, %15, %alloc_28) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : 
i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_29 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%34#0, %15, %alloc_29) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_30 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_29, %arg73, %arg74, %alloc_30) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %35:2 = call @Unknown50(%alloc_30, %33#0) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_28, %arg73, %arg74, %alloc_29) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %37:2 = call @Unknown48(%alloc_29, %35#0) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + %alloc_30 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvOp_f16f16_f16(%37#0, %16, %alloc_30) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x1x1xf16>, memref<4x512x7x7xf16> %alloc_31 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%35#0, %16, %alloc_31) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x1x1xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_30, %arg88, %arg89, %alloc_31) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> %alloc_32 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_31, %arg88, %arg89, %alloc_32) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + byre.compute @ConvOp_f16f16_f16(%37#0, %17, %alloc_32) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, 
memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x3x3xf16>, memref<4x512x7x7xf16> %alloc_33 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%35#0, %17, %alloc_33) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_32, %arg78, %arg79, %alloc_33) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %38:2 = call @Unknown55(%alloc_33) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_34 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_33, %arg78, %arg79, %alloc_34) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %36:2 = call @Unknown53(%alloc_34) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @ConvOp_f16f16_f16(%38#0, %18, %alloc_34) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_35 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%36#0, %18, %alloc_35) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_34, %arg83, %arg84, %alloc_35) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %39:2 = call @Unknown57(%alloc_35, %alloc_31) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_36 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_35, %arg83, %arg84, %alloc_36) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %37:2 = call @Unknown55(%alloc_36, %alloc_32) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute 
@ConvOp_f16f16_f16(%39#0, %19, %alloc_36) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_37 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%37#0, %19, %alloc_37) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_36, %arg93, %arg94, %alloc_37) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %40:2 = call @Unknown55(%alloc_37) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_38 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_37, %arg93, %arg94, %alloc_38) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %38:2 = call @Unknown57(%alloc_38) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @ConvOp_f16f16_f16(%40#0, %20, %alloc_38) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_39 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%38#0, %20, %alloc_39) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_40 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_39, %arg98, %arg99, %alloc_40) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %39:2 = call @Unknown59(%alloc_40, %37#0) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_38, %arg98, %arg99, %alloc_39) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, 
memref<4x512x7x7xf16>
+    %41:2 = call @Unknown57(%alloc_39, %39#0) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>)
+    %42 = call @Unknown62(%41#0) : (memref<4x512x7x7xf16>) -> memref<4x512xf16>
+    %43 = call @Unknown63(%42) : (memref<4x512xf16>) -> memref<4x512xf16>
+    %alloc_40 = memref.alloc() : memref<4x1000xf16>
+    byre.compute @MatmulOp_f16f16_f16(%43, %22, %alloc_40) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16>, memref<1000x512xf16>, memref<4x1000xf16>
+    %44 = call @Unknown64(%23, %alloc_40) : (memref<1000xf16>, memref<4x1000xf16>) -> memref<4x1000xf16>
+    %45 = call @Unknown65(%44) : (memref<4x1000xf16>) -> memref<4xf16>
+    %46 = call @Unknown66(%45, %44) : (memref<4xf16>, memref<4x1000xf16>) -> memref<4x1000xf16>
+    %47 = call @Unknown67(%46) : (memref<4x1000xf16>) -> memref<4xf16>
+    %48 = call @Unknown68(%47) : (memref<4xf16>) -> memref<4xf16>
+    %49:2 = call @Unknown69(%48, %46, %24, %21) : (memref<4xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>)
     %alloc_41 = memref.alloc() : memref<4x512xf16>
-    byre.compute @ReduceSumOp_f16_f16(%39#0, %alloc_41) {dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<4x512xf16>
-    %40 = call @Unknown60(%alloc_41) : (memref<4x512xf16>) -> memref<4x512xf16>
-    %alloc_42 = memref.alloc() : memref<4x1000xf16>
-    byre.compute @MatmulOp_f16f16_f16(%40, %22, %alloc_42) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16>, memref<1000x512xf16>, memref<4x1000xf16>
-    %41 = call @Unknown61(%arg103, %alloc_42) : (memref<1000xf32>, memref<4x1000xf16>) -> memref<4x1000xf16>
-    %alloc_43 = memref.alloc() : memref<4xf16>
-    byre.compute @ReduceMaxOp_f16_f16(%41, %alloc_43) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16>
-    %42:2 = call @Unknown62(%alloc_43, %41) : (memref<4xf16>, memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>)
-    %alloc_44 = memref.alloc() : memref<4xf16>
-    byre.compute @ReduceSumOp_f16_f16(%42#1, %alloc_44) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16>
-    %43:3 = call @Unknown63(%alloc_44, %42#0, %alloc_1, %21, %arg1) : (memref<4xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4x1000xf16>, memref<4x1000xf32>) -> (memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>)
-    %alloc_45 = memref.alloc() : memref<4x512xf16>
-    byre.compute @MatmulOp_f16f16_f16(%43#0, %22, %alloc_45) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16>, memref<1000x512xf16>, memref<4x512xf16>
-    %44 = call @Unknown64(%alloc_45, %39#1) : (memref<4x512xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16>
-    %alloc_46 = memref.alloc() : memref<4x512x7x7xf16>
-    %alloc_47 = memref.alloc() : memref<512xf32>
+    byre.compute @MatmulOp_f16f16_f16(%49#1, %22, %alloc_41) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16>, memref<1000x512xf16>, memref<4x512xf16>
+    %50 = call @Unknown70(%alloc_41, %41#1) : (memref<4x512xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16>
+    %alloc_42 = memref.alloc() : memref<4x512x7x7xf16>
+    %alloc_43 = memref.alloc() : memref<512xf32>
+    %alloc_44 = memref.alloc() : memref<512xf32>
+    byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_38, %arg98, %50, %alloc_42, %alloc_43, %alloc_44) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>
+    %alloc_45 = memref.alloc() : memref<4x512x7x7xf16>
+    byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_42, %20, %alloc_45) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16>
+    %alloc_46 = memref.alloc() : memref<512x512x3x3xf16>
+    byre.compute @ConvBackwardFilterOp_f16f16_f16(%40#0, %alloc_42, %alloc_46) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16>
+    %51 = call @Unknown74(%40#1, %alloc_45) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16>
+    %alloc_47 = memref.alloc() : memref<4x512x7x7xf16>
     %alloc_48 = memref.alloc() : memref<512xf32>
-    byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_39, %arg98, %44, %alloc_46, %alloc_47, %alloc_48) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>
-    %alloc_49 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_46, %20, %alloc_49) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16>
-    %alloc_50 = memref.alloc() : memref<512x512x3x3xf16>
-    byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_46, %alloc_50) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16>
-    %45 = call @Unknown68(%38#1, %alloc_49) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16>
-    %alloc_51 = memref.alloc() : memref<4x512x7x7xf16>
-    %alloc_52 = memref.alloc() : memref<512xf32>
+    %alloc_49 = memref.alloc() : memref<512xf32>
+    byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_36, %arg93, %51, %alloc_47, %alloc_48, %alloc_49) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>
+    %alloc_50 = memref.alloc() : memref<4x512x7x7xf16>
+    byre.compute
@ConvBackwardDataOp_f16f16_f16(%alloc_47, %19, %alloc_50) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_51 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%39#0, %alloc_47, %alloc_51) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %52 = call @Unknown78(%50, %alloc_50, %39#1) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> + %alloc_52 = memref.alloc() : memref<4x512x7x7xf16> %alloc_53 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_37, %arg93, %45, %alloc_51, %alloc_52, %alloc_53) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_54 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_51, %19, %alloc_54) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_55 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_51, %alloc_55) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %46 = call @Unknown72(%44, %alloc_54, %37#1) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> - %alloc_56 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_57 = memref.alloc() : memref<512xf32> + %alloc_54 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_34, %arg83, %52, %alloc_52, %alloc_53, %alloc_54) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_55 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_52, %18, %alloc_55) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_56 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute 
@ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_52, %alloc_56) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %53 = call @Unknown74(%38#1, %alloc_55) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> + %alloc_57 = memref.alloc() : memref<4x512x7x7xf16> %alloc_58 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_35, %arg83, %46, %alloc_56, %alloc_57, %alloc_58) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_59 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_56, %18, %alloc_59) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_60 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_56, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %47 = call @Unknown76(%36#1, %alloc_59) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> - %alloc_61 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_62 = memref.alloc() : memref<512xf32> + %alloc_59 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_32, %arg78, %53, %alloc_57, %alloc_58, %alloc_59) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_60 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_57, %17, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_61 = memref.alloc() : memref<512x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_57, %alloc_61) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x3x3xf16> + %alloc_62 = memref.alloc() : memref<4x512x7x7xf16> %alloc_63 = memref.alloc() : memref<512xf32> - byre.compute 
@BatchNormGradOp_f16f32f16_f16f32f32(%alloc_33, %arg78, %47, %alloc_61, %alloc_62, %alloc_63) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_64 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_61, %17, %alloc_64) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_65 = memref.alloc() : memref<512x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_61, %alloc_65) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x3x3xf16> - %alloc_66 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_67 = memref.alloc() : memref<512xf32> - %alloc_68 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_31, %arg88, %46, %alloc_66, %alloc_67, %alloc_68) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_69 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_66, %16, %alloc_69) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x1x1xf16>, memref<4x256x14x14xf16> - %alloc_70 = memref.alloc() : memref<512x256x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_66, %alloc_70) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x1x1xf16> - %48 = call @Unknown83(%alloc_69, %alloc_64, %35#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> - %alloc_71 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_72 = memref.alloc() : memref<256xf32> + %alloc_64 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_30, %arg88, %52, %alloc_62, %alloc_63, %alloc_64) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_65 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_62, %16, %alloc_65) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, 
input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x1x1xf16>, memref<4x256x14x14xf16> + %alloc_66 = memref.alloc() : memref<512x256x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_62, %alloc_66) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x1x1xf16> + %54 = call @Unknown89(%alloc_65, %alloc_60, %37#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> + %alloc_67 = memref.alloc() : memref<4x256x14x14xf16> + %alloc_68 = memref.alloc() : memref<256xf32> + %alloc_69 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_28, %arg73, %54, %alloc_67, %alloc_68, %alloc_69) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_70 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_67, %15, %alloc_70) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_71 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_67, %alloc_71) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %55 = call @Unknown93(%36#1, %alloc_70) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> + %alloc_72 = memref.alloc() : memref<4x256x14x14xf16> %alloc_73 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_29, %arg73, %48, %alloc_71, %alloc_72, %alloc_73) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_74 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_71, %15, %alloc_74) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_75 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_71, %alloc_75) {batch_group_count = 1 : i64, 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %49 = call @Unknown87(%34#1, %alloc_74) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> - %alloc_76 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_77 = memref.alloc() : memref<256xf32> + %alloc_74 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_26, %arg68, %55, %alloc_72, %alloc_73, %alloc_74) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_75 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_72, %14, %alloc_75) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_76 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_72, %alloc_76) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %56 = call @Unknown89(%54, %alloc_75, %35#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> + %alloc_77 = memref.alloc() : memref<4x256x14x14xf16> %alloc_78 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_27, %arg68, %49, %alloc_76, %alloc_77, %alloc_78) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_79 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_76, %14, %alloc_79) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_80 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_76, %alloc_80) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %50 = call @Unknown91(%48, %alloc_79, %33#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> 
memref<4x256x14x14xf16> - %alloc_81 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_82 = memref.alloc() : memref<256xf32> + %alloc_79 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_24, %arg58, %56, %alloc_77, %alloc_78, %alloc_79) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_80 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_77, %13, %alloc_80) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_81 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_77, %alloc_81) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %57 = call @Unknown93(%34#1, %alloc_80) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> + %alloc_82 = memref.alloc() : memref<4x256x14x14xf16> %alloc_83 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_25, %arg58, %50, %alloc_81, %alloc_82, %alloc_83) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_84 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_81, %13, %alloc_84) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_85 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_81, %alloc_85) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %51 = call @Unknown95(%32#1, %alloc_84) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> - %alloc_86 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_87 = memref.alloc() : memref<256xf32> + %alloc_84 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_22, %arg53, %57, %alloc_82, %alloc_83, %alloc_84) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, 
memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_85 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_82, %12, %alloc_85) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_86 = memref.alloc() : memref<256x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_82, %alloc_86) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x3x3xf16> + %alloc_87 = memref.alloc() : memref<4x256x14x14xf16> %alloc_88 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_23, %arg53, %51, %alloc_86, %alloc_87, %alloc_88) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_89 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_86, %12, %alloc_89) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_90 = memref.alloc() : memref<256x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_86, %alloc_90) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x3x3xf16> - %alloc_91 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_92 = memref.alloc() : memref<256xf32> - %alloc_93 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_21, %arg63, %50, %alloc_91, %alloc_92, %alloc_93) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_94 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_91, %11, %alloc_94) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x1x1xf16>, memref<4x128x28x28xf16> - %alloc_95 = memref.alloc() : memref<256x128x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_91, %alloc_95) {batch_group_count = 1 : i64, 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x1x1xf16> - %52 = call @Unknown102(%alloc_94, %alloc_89, %31#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> - %alloc_96 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_97 = memref.alloc() : memref<128xf32> + %alloc_89 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_20, %arg63, %56, %alloc_87, %alloc_88, %alloc_89) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_90 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_87, %11, %alloc_90) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x1x1xf16>, memref<4x128x28x28xf16> + %alloc_91 = memref.alloc() : memref<256x128x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_87, %alloc_91) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x1x1xf16> + %58 = call @Unknown108(%alloc_90, %alloc_85, %33#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> + %alloc_92 = memref.alloc() : memref<4x128x28x28xf16> + %alloc_93 = memref.alloc() : memref<128xf32> + %alloc_94 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_18, %arg48, %58, %alloc_92, %alloc_93, %alloc_94) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_95 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_92, %10, %alloc_95) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_96 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_92, %alloc_96) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %59 = call @Unknown112(%32#1, %alloc_95) : 
(memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> + %alloc_97 = memref.alloc() : memref<4x128x28x28xf16> %alloc_98 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_19, %arg48, %52, %alloc_96, %alloc_97, %alloc_98) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_99 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_96, %10, %alloc_99) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_100 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_96, %alloc_100) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %53 = call @Unknown106(%30#1, %alloc_99) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> - %alloc_101 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_102 = memref.alloc() : memref<128xf32> + %alloc_99 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_16, %arg43, %59, %alloc_97, %alloc_98, %alloc_99) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_100 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_97, %9, %alloc_100) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_101 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_97, %alloc_101) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %60 = call @Unknown108(%58, %alloc_100, %31#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> + %alloc_102 = memref.alloc() : memref<4x128x28x28xf16> %alloc_103 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_17, %arg43, %53, %alloc_101, %alloc_102, %alloc_103) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : 
memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_104 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_101, %9, %alloc_104) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_105 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_101, %alloc_105) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %54 = call @Unknown110(%52, %alloc_104, %29#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> - %alloc_106 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_107 = memref.alloc() : memref<128xf32> + %alloc_104 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_14, %arg33, %60, %alloc_102, %alloc_103, %alloc_104) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_105 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_102, %8, %alloc_105) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_106 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_102, %alloc_106) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %61 = call @Unknown112(%30#1, %alloc_105) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> + %alloc_107 = memref.alloc() : memref<4x128x28x28xf16> %alloc_108 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_15, %arg33, %54, %alloc_106, %alloc_107, %alloc_108) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_109 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_106, %8, %alloc_109) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = 
"NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_110 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_106, %alloc_110) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %55 = call @Unknown114(%28#1, %alloc_109) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> - %alloc_111 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_112 = memref.alloc() : memref<128xf32> + %alloc_109 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_12, %arg28, %61, %alloc_107, %alloc_108, %alloc_109) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_110 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_107, %7, %alloc_110) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_111 = memref.alloc() : memref<128x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_107, %alloc_111) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x3x3xf16> + %alloc_112 = memref.alloc() : memref<4x128x28x28xf16> %alloc_113 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_13, %arg28, %55, %alloc_111, %alloc_112, %alloc_113) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_114 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_111, %7, %alloc_114) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_115 = memref.alloc() : memref<128x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_111, %alloc_115) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, 
memref<4x128x28x28xf16>, memref<128x64x3x3xf16> - %alloc_116 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_117 = memref.alloc() : memref<128xf32> - %alloc_118 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_11, %arg38, %54, %alloc_116, %alloc_117, %alloc_118) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_119 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_116, %6, %alloc_119) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x1x1xf16>, memref<4x64x56x56xf16> - %alloc_120 = memref.alloc() : memref<128x64x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_116, %alloc_120) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x1x1xf16> - %56 = call @Unknown121(%alloc_119, %alloc_114, %27#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> - %alloc_121 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_122 = memref.alloc() : memref<64xf32> + %alloc_114 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_10, %arg38, %60, %alloc_112, %alloc_113, %alloc_114) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_115 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_112, %6, %alloc_115) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x1x1xf16>, memref<4x64x56x56xf16> + %alloc_116 = memref.alloc() : memref<128x64x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_112, %alloc_116) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x1x1xf16> + %62 = call @Unknown127(%alloc_115, %alloc_110, %29#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> + %alloc_117 = memref.alloc() : memref<4x64x56x56xf16> + %alloc_118 = memref.alloc() : memref<64xf32> + %alloc_119 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_8, %arg23, %62, %alloc_117, %alloc_118, %alloc_119) {epsilon = 9.99999974E-6 : f32, 
feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_120 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_117, %5, %alloc_120) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_121 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_117, %alloc_121) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %63 = call @Unknown131(%28#1, %alloc_120) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_122 = memref.alloc() : memref<4x64x56x56xf16> %alloc_123 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_9, %arg23, %56, %alloc_121, %alloc_122, %alloc_123) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_124 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_121, %5, %alloc_124) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_125 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_121, %alloc_125) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %57 = call @Unknown125(%26#1, %alloc_124) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_126 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_127 = memref.alloc() : memref<64xf32> + %alloc_124 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_6, %arg18, %63, %alloc_122, %alloc_123, %alloc_124) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_125 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_122, %4, %alloc_125) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_126 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_122, %alloc_126) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %64 = call @Unknown127(%62, %alloc_125, %27#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> + %alloc_127 = memref.alloc() : memref<4x64x56x56xf16> %alloc_128 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_7, %arg18, %57, %alloc_126, %alloc_127, %alloc_128) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_129 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_126, %4, %alloc_129) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_130 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%25#0, %alloc_126, %alloc_130) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %58 = call @Unknown129(%56, %alloc_129, %25#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> - %alloc_131 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_132 = memref.alloc() : memref<64xf32> + %alloc_129 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_4, %arg13, %64, %alloc_127, %alloc_128, %alloc_129) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_130 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_127, %3, %alloc_130) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_131 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_127, %alloc_131) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 
1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %65 = call @Unknown131(%26#1, %alloc_130) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_132 = memref.alloc() : memref<4x64x56x56xf16> %alloc_133 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_5, %arg13, %58, %alloc_131, %alloc_132, %alloc_133) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_134 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_131, %3, %alloc_134) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_135 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%24#0, %alloc_131, %alloc_135) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %59 = call @Unknown133(%24#1, %alloc_134) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_136 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_137 = memref.alloc() : memref<64xf32> - %alloc_138 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_3, %arg8, %59, %alloc_136, %alloc_137, %alloc_138) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_139 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_136, %2, %alloc_139) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_140 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%alloc_2, %alloc_136, %alloc_140) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %60 = call @Unknown137(%58, %alloc_139) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_141 = memref.alloc() : memref<4x64x112x112xf16> - byre.compute @PoolMaxGradOp_f16f16_f16(%23#0, %60, %alloc_141) {memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = 
dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16>, memref<4x64x112x112xf16> - %61 = call @Unknown138(%23#1, %alloc_141) : (memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> - %alloc_142 = memref.alloc() : memref<4x64x112x112xf16> - %alloc_143 = memref.alloc() : memref<64xf32> - %alloc_144 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %61, %alloc_142, %alloc_143, %alloc_144) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<4x64x112x112xf16>, memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32> - %alloc_145 = memref.alloc() : memref<64x3x7x7xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_142, %alloc_145) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16>, memref<4x64x112x112xf16>, memref<64x3x7x7xf16> - %alloc_146 = memref.alloc() : memref - byre.compute @ReduceSumOp_f32_f32(%43#1, %alloc_146) {dimensions = dense<[0, 1]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32>, memref - %62 = call @Unknown141(%alloc_146) : (memref) -> memref - %63 = call @Unknown142(%alloc_145) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> - %64 = call @Unknown143(%alloc_140) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %65 = call @Unknown144(%alloc_135) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %66 = call @Unknown145(%alloc_130) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %67 = call @Unknown146(%alloc_125) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %68 = call @Unknown147(%alloc_115) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> - %69 = call @Unknown148(%alloc_110) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %70 = call @Unknown149(%alloc_120) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> - %71 = call @Unknown150(%alloc_105) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %72 = call @Unknown151(%alloc_100) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %73 = call @Unknown152(%alloc_90) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> - %74 = call @Unknown153(%alloc_85) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %75 = call @Unknown154(%alloc_95) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> - %76 = call @Unknown155(%alloc_80) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %77 = call @Unknown156(%alloc_75) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %78 = call @Unknown157(%alloc_65) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> - %79 = call @Unknown158(%alloc_60) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %80 = call @Unknown159(%alloc_70) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> - %81 = call @Unknown160(%alloc_55) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %82 = call @Unknown161(%alloc_50) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %alloc_147 = memref.alloc() : memref<1000x512xf16> - byre.compute @MatmulOp_f16f16_f16(%40, %43#0, %alloc_147) 
{lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16>, memref<4x1000xf16>, memref<1000x512xf16> - %83 = call @Unknown163(%alloc_147) : (memref<1000x512xf16>) -> memref<1000x512xf32> - %alloc_148 = memref.alloc() : memref<1000xf32> - byre.compute @ReduceSumOp_f32_f32(%43#2, %alloc_148) {dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32>, memref<1000xf32> - %84 = call @Unknown164(%alloc_148) : (memref<1000xf32>) -> memref<1000xf32> - return %62, %63, %alloc_143, %alloc_144, %64, %alloc_137, %alloc_138, %65, %alloc_132, %alloc_133, %66, %alloc_127, %alloc_128, %67, %alloc_122, %alloc_123, %68, %alloc_112, %alloc_113, %69, %alloc_107, %alloc_108, %70, %alloc_117, %alloc_118, %71, %alloc_102, %alloc_103, %72, %alloc_97, %alloc_98, %73, %alloc_87, %alloc_88, %74, %alloc_82, %alloc_83, %75, %alloc_92, %alloc_93, %76, %alloc_77, %alloc_78, %77, %alloc_72, %alloc_73, %78, %alloc_62, %alloc_63, %79, %alloc_57, %alloc_58, %80, %alloc_67, %alloc_68, %81, %alloc_52, %alloc_53, %82, %alloc_47, %alloc_48, %83, %84 : memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32> + %alloc_134 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_2, %arg8, %65, %alloc_132, %alloc_133, %alloc_134) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_135 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_132, %2, %alloc_135) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_136 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%alloc_1, %alloc_132, %alloc_136) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", 
padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %66 = call @Unknown143(%64, %alloc_135) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_137 = memref.alloc() : memref<4x64x112x112xf16> + byre.compute @PoolMaxGradOp_f16f16_f16(%25#0, %66, %alloc_137) {memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16>, memref<4x64x112x112xf16> + %67 = call @Unknown144(%25#1, %alloc_137) : (memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> + %alloc_138 = memref.alloc() : memref<4x64x112x112xf16> + %alloc_139 = memref.alloc() : memref<64xf32> + %alloc_140 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %67, %alloc_138, %alloc_139, %alloc_140) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<4x64x112x112xf16>, memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32> + %alloc_141 = memref.alloc() : memref<64x3x7x7xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_138, %alloc_141) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16>, memref<4x64x112x112xf16>, memref<64x3x7x7xf16> + %68 = call @Unknown147(%49#0, %arg1) : (memref<4x1000xf16>, memref<4x1000xf32>) -> memref + %69 = call @Unknown148(%68) : (memref) -> memref + %70 = call @Unknown149(%alloc_141) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> + %71 = call @Unknown150(%alloc_136) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %72 = call @Unknown150(%alloc_131) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %73 = call @Unknown150(%alloc_126) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %74 = call @Unknown150(%alloc_121) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %75 = call @Unknown154(%alloc_111) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> + %76 = call @Unknown155(%alloc_106) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %77 = call @Unknown156(%alloc_116) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> + %78 = call @Unknown155(%alloc_101) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %79 = call @Unknown155(%alloc_96) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %80 = call @Unknown159(%alloc_86) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> + %81 = call @Unknown160(%alloc_81) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %82 = call @Unknown161(%alloc_91) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> + %83 = call @Unknown160(%alloc_76) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %84 = call @Unknown160(%alloc_71) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %85 = call @Unknown164(%alloc_61) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> + %86 = call @Unknown165(%alloc_56) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %87 = call @Unknown166(%alloc_66) : (memref<512x256x1x1xf16>) -> 
memref<512x256x1x1xf32> + %88 = call @Unknown165(%alloc_51) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %89 = call @Unknown165(%alloc_46) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %alloc_142 = memref.alloc() : memref<1000x512xf16> + byre.compute @MatmulOp_f16f16_f16(%43, %49#1, %alloc_142) {lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16>, memref<4x1000xf16>, memref<1000x512xf16> + %90 = call @Unknown170(%alloc_142) : (memref<1000x512xf16>) -> memref<1000x512xf32> + %91 = call @Unknown171(%49#1) : (memref<4x1000xf16>) -> memref<1000xf32> + %92 = call @Unknown172(%91) : (memref<1000xf32>) -> memref<1000xf32> + return %69, %70, %alloc_139, %alloc_140, %71, %alloc_133, %alloc_134, %72, %alloc_128, %alloc_129, %73, %alloc_123, %alloc_124, %74, %alloc_118, %alloc_119, %75, %alloc_108, %alloc_109, %76, %alloc_103, %alloc_104, %77, %alloc_113, %alloc_114, %78, %alloc_98, %alloc_99, %79, %alloc_93, %alloc_94, %80, %alloc_83, %alloc_84, %81, %alloc_78, %alloc_79, %82, %alloc_88, %alloc_89, %83, %alloc_73, %alloc_74, %84, %alloc_68, %alloc_69, %85, %alloc_58, %alloc_59, %86, %alloc_53, %alloc_54, %87, %alloc_63, %alloc_64, %88, %alloc_48, %alloc_49, %89, %alloc_43, %alloc_44, %90, %92 : memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/5_alternative_scf_opt.mlir b/compiler/test/E2E/ResNet18/Whole/5_alternative_scf_opt.mlir index 53e23b190..a0b85713c 100644 --- a/compiler/test/E2E/ResNet18/Whole/5_alternative_scf_opt.mlir +++ b/compiler/test/E2E/ResNet18/Whole/5_alternative_scf_opt.mlir @@ -2,676 +2,1631 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1) -> (d0, d1)> -#map2 = affine_map<(d0, d1) -> (d1)> -#map3 = affine_map<(d0, d1) -> (d0)> -#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> -#map5 = affine_map<() -> ()> -#map6 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> +#map1 = affine_map<(d0) -> (d0 * 2 - (d0 floordiv 512) * 1024, 1000)> +#map2 = affine_map<(d0) -> (d0 * 2 - (d0 floordiv 512) * 1024 + 2, 1000)> +#map3 = affine_map<(d0, d1) -> (d0 - d1)> +#map4 = affine_map<(d0) -> (d0 * 2)> +#map5 = affine_map<(d0) -> (d0 * 2 + 1)> +#map6 = affine_map<(d0) -> (d0 mod 64, 49)> 
+#map7 = affine_map<(d0) -> (d0 mod 64 + 1, 49)> +#map8 = affine_map<(d0) -> (d0 mod 128, 125)> +#map9 = affine_map<(d0) -> (d0 mod 128 + 1, 125)> +#map10 = affine_map<(d0)[s0] -> (d0 * 32 + s0)> +#map11 = affine_map<(d0) -> (d0 * -32 + 1000, 32)> +#map12 = affine_map<(d0) -> (d0 * 32)> +#map13 = affine_map<(d0, d1) -> (d1 * -32 + 1000, 32, d0)> +#map14 = affine_map<(d0, d1) -> (d1 * -32 + 1000, 32, d0 + 1)> module @IrToMhlo.2452 { func.func private @Unknown0(%arg0: memref<4x3x224x224xf32>) -> memref<4x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c224 = arith.constant 224 : index %alloc = memref.alloc() : memref<4x3x224x224xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x3x224x224xf32>) outs(%alloc : memref<4x3x224x224xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c224 step %c1 { + scf.for %arg4 = %c0 to %c224 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x3x224x224xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x3x224x224xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x3x224x224xf16> } func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<64x3x7x7xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x3x7x7xf32>) outs(%alloc : memref<64x3x7x7xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<64x3x7x7xf16> } func.func private @Unknown3(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - 
^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown5(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<64x64x3x3xf16> } func.func private @Unknown7(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x1x1xf32>) outs(%alloc : memref<128x64x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<128x64x1x1xf16> } func.func private @Unknown8(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> 
attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x3x3xf32>) outs(%alloc : memref<128x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<128x64x3x3xf16> } func.func private @Unknown9(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown10(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown11(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : 
memref<128x128x3x3xf16> } func.func private @Unknown12(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x1x1xf32>) outs(%alloc : memref<256x128x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<256x128x1x1xf16> } func.func private @Unknown13(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x3x3xf32>) outs(%alloc : memref<256x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<256x128x3x3xf16> } func.func private @Unknown14(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown15(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : 
memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown16(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<256x256x3x3xf16> } func.func private @Unknown17(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x1x1xf32>) outs(%alloc : memref<512x256x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<512x256x1x1xf16> } func.func private @Unknown18(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x3x3xf32>) outs(%alloc : memref<512x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : 
memref<512x256x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<512x256x3x3xf16> } func.func private @Unknown19(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown20(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown21(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<512x512x3x3xf16> } func.func private @Unknown22(%arg0: memref<4x1000xf32>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant -2.500000e-01 : f32 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<4x1000xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<4x1000xf32>) outs(%alloc : memref<4x1000xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.mulf %in, %cst : f32 - %1 = arith.truncf %0 : f32 to f16 - linalg.yield %1 : f16 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<4x1000xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + 
linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.mulf %in, %cst : f32 + %1 = arith.truncf %0 : f32 to f16 + linalg.yield %1 : f16 + } + } } return %alloc : memref<4x1000xf16> } func.func private @Unknown23(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1000x512xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1000x512xf32>) outs(%alloc : memref<1000x512xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c1000 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<1000x512xf16> } - func.func private @Unknown24(%arg0: memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown24(%arg0: memref<1000xf32>) -> memref<1000xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %alloc = memref.alloc() : memref<1000xf16> + scf.for %arg1 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg1] [1] [1] : memref<1000xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1] [1] [1] : memref<1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + return %alloc : memref<1000xf16> + } + func.func private @Unknown25(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = affine.min #map1(%arg2) + %1 = affine.min #map2(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_8 = memref.subview %expand_shape[0, %0] [1, %2] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %9 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %9 : f16 + } else { + scf.yield %cst : f16 + } + %5 = 
arith.addf %4, %cst : f16 + %6 = arith.cmpi ugt, %2, %c1 : index + %7 = scf.if %6 -> (f16) { + %9 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %9 : f16 + } else { + scf.yield %cst : f16 + } + %8 = arith.addf %5, %7 : f16 + memref.store %8, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca[%3] : memref<512xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_0[%3] : memref<256xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_1[%3] : memref<128xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_2[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_3[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_4[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_5[%3] : memref<8xf16, #gpu.address_space> + %5 = 
arith.addf %4, %2 : f16 + memref.store %5, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_6[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_7[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_7[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private @Unknown26(%arg0: memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %alloc = memref.alloc() : memref<4x64x112x112xf16> %alloc_0 = memref.alloc() : memref<4x64x112x112xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x64x112x112xf16>) outs(%alloc, %alloc_0 : memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c112 step %c1 { + scf.for %arg4 = %c0 to %c112 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x64x112x112xf16>, memref<4x64x112x112xi1> } - func.func private @Unknown26(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x64x56x56xf16> - %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x64x56x56xf16>) outs(%alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 - } - return %alloc, %alloc_0 
: memref<4x64x56x56xf16>, memref<4x64x56x56xi1> - } - func.func private @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown28(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> } - func.func private @Unknown30(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x64x56x56xf16>) outs(%alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xi1> to memref> + 
%subview_2 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_3 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3 : memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_4: f16, %out: f16, %out_5: i1): + %0 = arith.addf %in, %in_4 : f16 + %1 = arith.maximumf %0, %cst : f16 + %2 = arith.cmpf ogt, %1, %cst : f16 + linalg.yield %1, %2 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> } - func.func private @Unknown32(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x64x56x56xf16> - %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> - } - func.func private @Unknown35(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x128x28x28xf16> - %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x128x28x28xf16>) outs(%alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> - } - func.func private @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown37(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) outs(%alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 
= %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c28 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> } - func.func private @Unknown39(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x128x28x28xf16>) outs(%alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c128 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + scf.for %arg5 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xi1> to memref> + %subview_2 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_3 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3 : memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_4: f16, %out: f16, %out_5: i1): + %0 = arith.addf %in, %in_4 : f16 + %1 = arith.maximumf %0, %cst : f16 + %2 = arith.cmpf ogt, %1, %cst : f16 + linalg.yield %1, %2 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> } - func.func private @Unknown41(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x128x28x28xf16> - %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : 
memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) outs(%alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> - } - func.func private @Unknown44(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown46(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x256x14x14xf16>) outs(%alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c14 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> } - func.func private @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) outs(%alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 + scf.for %arg2 = %c0 to %c4 step 
%c1 { + scf.for %arg3 = %c0 to %c256 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + scf.for %arg5 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xi1> to memref> + %subview_2 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_3 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3 : memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_4: f16, %out: f16, %out_5: i1): + %0 = arith.addf %in, %in_4 : f16 + %1 = arith.maximumf %0, %cst : f16 + %2 = arith.cmpf ogt, %1, %cst : f16 + linalg.yield %1, %2 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> } - func.func private @Unknown48(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x256x14x14xf16> - %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x256x14x14xf16>) outs(%alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> - } - func.func private @Unknown50(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x256x14x14xf16> - %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) outs(%alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> - } - func.func private @Unknown53(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown55(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x512x7x7xf16>) outs(%alloc, 
%alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> } - func.func private @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) outs(%alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c512 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + %subview_2 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_3 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3 : memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_4: f16, %out: f16, %out_5: i1): + %0 = arith.addf %in, %in_4 : f16 + %1 = arith.maximumf %0, %cst : f16 + %2 = arith.cmpf ogt, %1, %cst : f16 + linalg.yield %1, %2 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> } - func.func private @Unknown57(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + 
func.func private @Unknown62(%arg0: memref<4x512x7x7xf16>) -> memref<4x512xf16> attributes {__byteir_reduction_fusion__} { %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x512x7x7xf16> - %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x512x7x7xf16>) outs(%alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> - } - func.func private @Unknown59(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x512x7x7xf16> - %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) outs(%alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> - } - func.func private @Unknown60(%arg0: memref<4x512xf16>) -> memref<4x512xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %collapse_shape = memref.collapse_shape %arg0 [[0, 1], [2, 3]] : memref<4x512x7x7xf16> into memref<2048x49xf16> + %alloc = memref.alloc() : memref<2048xf16> + scf.forall (%arg1) in (2048) { + %subview = memref.subview %collapse_shape[%arg1, 0] [1, 49] [1, 1] : memref<2048x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape_0 = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.min #map6(%arg2) + %1 = affine.min #map7(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_6 = memref.subview %expand_shape_0[0, %0] [1, %2] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_7 = memref.expand_shape %subview_6 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %6 = memref.load %expand_shape_7[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %6 : f16 + } else { + scf.yield %cst : f16 + } + %5 = arith.addf %4, %cst : f16 + memref.store %5, %alloca[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall 
(%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_1[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_2[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_3[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_4[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_5[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<2048xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + %expand_shape = memref.expand_shape %alloc [[0, 1]] : memref<2048xf16> into memref<4x512xf16> + return %expand_shape : memref<4x512xf16> + } + func.func private @Unknown63(%arg0: memref<4x512xf16>) -> memref<4x512xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 2.040100e-02 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<4x512xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<4x512xf16>) outs(%alloc : memref<4x512xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.mulf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<4x512xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<4x512xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.mulf %in, %cst : f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<4x512xf16> } - func.func private @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<4x1000xf16>) -> 
memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown64(%arg0: memref<1000xf16>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<4x1000xf16> - linalg.generic {indexing_maps = [#map1, #map2, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<4x1000xf16>, memref<1000xf32>) outs(%alloc : memref<4x1000xf16>) { - ^bb0(%in: f16, %in_0: f32, %out: f16): - %0 = arith.truncf %in_0 : f32 to f16 - %1 = arith.addf %in, %0 : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg3] [1] [1] : memref<1000xf16> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in_2, %in : f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<4x1000xf16> } - func.func private @Unknown62(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown65(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = affine.min #map1(%arg2) + %1 = affine.min #map2(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_8 = memref.subview %expand_shape[0, %0] [1, %2] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %8 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %8 : f16 + } else { + scf.yield %cst : f16 + } + %5 = arith.cmpi ugt, %2, %c1 : index + %6 = scf.if %5 -> (f16) { + %8 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %8 : f16 + } else { + scf.yield %cst : f16 + } + %7 = arith.maximumf %4, %6 : f16 + memref.store %7, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + 
} {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_0[%2] : memref<256xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_1[%2] : memref<128xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_2[%2] : memref<64xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_3[%2] : memref<32xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_4[%2] : memref<16xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_5[%2] : memref<8xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_6[%2] : memref<4xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_7[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_7[%2] : memref<2xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private 
@Unknown66(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<4x1000xf16> - %alloc_0 = memref.alloc() : memref<4x1000xf16> - linalg.generic {indexing_maps = [#map1, #map3, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<4x1000xf16>, memref<4xf16>) outs(%alloc, %alloc_0 : memref<4x1000xf16>, memref<4x1000xf16>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: f16): - %0 = arith.subf %in, %in_1 : f16 - %1 = math.exp %0 : f16 - linalg.yield %0, %1 : f16, f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg2] [1] [1] : memref<4xf16> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.subf %in_2, %in : f16 + linalg.yield %0 : f16 + } + } } - return %alloc, %alloc_0 : memref<4x1000xf16>, memref<4x1000xf16> + return %alloc : memref<4x1000xf16> } - func.func private @Unknown63(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf32>) -> (memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown67(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = affine.min #map1(%arg2) + %1 = affine.min #map2(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_8 = memref.subview %expand_shape[0, %0] [1, %2] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %11 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %11 : f16 + } else { + scf.yield %cst : f16 + } + %5 = math.exp %4 : f16 + %6 = arith.addf %5, %cst : f16 + %7 = arith.cmpi ugt, %2, %c1 : index + %8 = scf.if %7 -> (f16) { + %11 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %11 : f16 + } else { + scf.yield %cst : f16 + } + %9 = math.exp %8 : f16 + %10 = arith.addf %6, %9 : f16 + memref.store %10, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load 
%alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca[%3] : memref<512xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_0[%3] : memref<256xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_1[%3] : memref<128xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_2[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_3[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_4[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_5[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_6[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_7[%arg2] : 
memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_7[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private @Unknown68(%arg0: memref<4xf16>) -> memref<4xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %alloc = memref.alloc() : memref<4xf16> + scf.for %arg1 = %c0 to %c4 step %c1 { + %subview = memref.subview %arg0[%arg1] [1] [1] : memref<4xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1] [1] [1] : memref<4xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = math.log %in : f16 + linalg.yield %0 : f16 + } + } + return %alloc : memref<4xf16> + } + func.func private @Unknown69(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<4x1000xf16> - %alloc_0 = memref.alloc() : memref<4x1000xf32> - %alloc_1 = memref.alloc() : memref<4x1000xf32> - linalg.generic {indexing_maps = [#map1, #map1, #map3, #map3, #map1, #map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg3, %arg1, %arg0, %arg2, %arg4 : memref<4x1000xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4xf16>, memref<4x1000xf32>) outs(%alloc, %alloc_0, %alloc_1 : memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>) { - ^bb0(%in: f16, %in_2: f16, %in_3: f16, %in_4: f16, %in_5: f32, %out: f16, %out_6: f32, %out_7: f32): - %0 = math.log %in_3 : f16 - %1 = arith.subf %in_2, %0 : f16 - %2 = math.exp %1 : f16 - %3 = arith.mulf %2, %in_4 : f16 - %4 = arith.subf %in, %3 : f16 - %5 = arith.extf %1 : f16 to f32 - %6 = arith.mulf %5, %in_5 : f32 - %7 = arith.extf %4 : f16 to f32 - linalg.yield %4, %6, %7 : f16, f32, f32 + %alloc_0 = memref.alloc() : memref<4x1000xf16> + scf.for %arg4 = %c0 to %c4 step %c1 { + scf.for %arg5 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg2[%arg4] [1] [1] : memref<4xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg4, %arg5] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_2 = memref.subview %alloc[%arg4, %arg5] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_3 = memref.subview %arg0[%arg4] [1] [1] : memref<4xf16> to memref> + %subview_4 = memref.subview %arg1[%arg4, %arg5] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_5 = memref.subview %arg3[%arg4, %arg5] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3, %subview_4, %subview_5 : memref>, memref>, memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_6: f16, %in_7: f16, %in_8: f16, %out: f16, %out_9: f16): + %0 = arith.subf %in_7, %in_6 : f16 + %1 = math.exp %0 : f16 + %2 = arith.mulf %1, %in : f16 + %3 = 
arith.subf %in_8, %2 : f16 + linalg.yield %0, %3 : f16, f16 + } + } } - return %alloc, %alloc_0, %alloc_1 : memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32> + return %alloc, %alloc_0 : memref<4x1000xf16>, memref<4x1000xf16> } - func.func private @Unknown64(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown70(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 4.900000e+01 : f16 %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map4, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1, %arg0 : memref<4x512x7x7xi1>, memref<4x512xf16>) outs(%alloc : memref<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_1: f16, %out: f16): - %0 = arith.divf %in_1, %cst : f16 - %1 = arith.select %in, %0, %cst_0 : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c512 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3] [1, 1] [1, 1] : memref<4x512xf16> to memref> + %subview_1 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_2 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref>, memref>) outs(%subview_1 : memref>) { + ^bb0(%in: f16, %in_3: i1, %out: f16): + %0 = arith.divf %in, %cst : f16 + %1 = arith.select %in_3, %0, %cst_0 : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown68(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown74(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) outs(%alloc : memref<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c512 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, 
#map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown72(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x512x7x7xi1>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) outs(%alloc : memref<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c512 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + scf.for %arg6 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_2 = memref.subview %arg2[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: i1, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.select %in_4, %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown76(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) outs(%alloc : memref<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x512x7x7xf16> - } - func.func private @Unknown83(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x256x14x14xi1>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) outs(%alloc : 
memref<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<4x256x14x14xf16> - } - func.func private @Unknown87(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) outs(%alloc : memref<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c256 step %c1 { + scf.for %arg5 = %c0 to %c14 step %c1 { + scf.for %arg6 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_2 = memref.subview %arg2[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: i1, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.select %in_4, %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x256x14x14xf16> } - func.func private @Unknown91(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown93(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x256x14x14xi1>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) outs(%alloc : memref<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c256 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + scf.for %arg5 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : 
memref<4x256x14x14xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x256x14x14xf16> } - func.func private @Unknown95(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) outs(%alloc : memref<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x256x14x14xf16> - } - func.func private @Unknown102(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown108(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x128x28x28xi1>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) outs(%alloc : memref<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c128 step %c1 { + scf.for %arg5 = %c0 to %c28 step %c1 { + scf.for %arg6 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_2 = memref.subview %arg2[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: i1, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.select %in_4, %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x128x28x28xf16> } - func.func private @Unknown106(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes 
{__byteir_elementwise_fusion__} { + func.func private @Unknown112(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) outs(%alloc : memref<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c128 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + scf.for %arg5 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x128x28x28xf16> } - func.func private @Unknown110(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x128x28x28xi1>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) outs(%alloc : memref<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<4x128x28x28xf16> - } - func.func private @Unknown114(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) outs(%alloc : memref<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x128x28x28xf16> - } - func.func private @Unknown121(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown127(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = 
arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x64x56x56xi1>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c64 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + scf.for %arg6 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_2 = memref.subview %arg2[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: i1, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.select %in_4, %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown125(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown131(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown129(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> 
memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + func.func private @Unknown143(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x64x56x56xi1>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown133(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x64x56x56xf16> - } - func.func private @Unknown137(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x64x56x56xf16> - } - func.func private @Unknown138(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown144(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %alloc = 
memref.alloc() : memref<4x64x112x112xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) outs(%alloc : memref<4x64x112x112xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c112 step %c1 { + scf.for %arg5 = %c0 to %c112 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x64x112x112xf16> } - func.func private @Unknown141(%arg0: memref) -> memref attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown147(%arg0: memref<4x1000xf16>, %arg1: memref<4x1000xf32>) -> memref attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref + %collapse_shape = memref.collapse_shape %arg0 [[0, 1]] : memref<4x1000xf16> into memref<4000xf16> + %collapse_shape_1 = memref.collapse_shape %arg1 [[0, 1]] : memref<4x1000xf32> into memref<4000xf32> + %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<4000xf16> into memref<32x125xf16> + %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] : memref<4000xf32> into memref<32x125xf32> + %alloc_3 = memref.alloc() : memref<32xf32> + scf.forall (%arg2) in (32) { + %subview = memref.subview %expand_shape[%arg2, 0] [1, 125] [1, 1] : memref<32x125xf16> to memref<125xf16, strided<[1], offset: ?>> + %expand_shape_4 = memref.expand_shape %subview [[0, 1]] : memref<125xf16, strided<[1], offset: ?>> into memref<1x125xf16, strided<[125, 1], offset: ?>> + %subview_5 = memref.subview %expand_shape_2[%arg2, 0] [1, 125] [1, 1] : memref<32x125xf32> to memref<125xf32, strided<[1], offset: ?>> + %expand_shape_6 = memref.expand_shape %subview_5 [[0, 1]] : memref<125xf32, strided<[1], offset: ?>> into memref<1x125xf32, strided<[125, 1], offset: ?>> + %alloca = memref.alloca() : memref<128xf32, #gpu.address_space> + scf.forall (%arg3) in (128) { + %0 = affine.min #map8(%arg3) + %1 = affine.min #map9(%arg3) + %2 = affine.apply #map3(%1, %0) + %subview_13 = memref.subview %expand_shape_4[0, %0] [1, %2] [1, 1] : memref<1x125xf16, strided<[125, 1], offset: ?>> to memref> + %expand_shape_14 = memref.expand_shape %subview_13 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %subview_15 = memref.subview %expand_shape_6[0, %0] [1, %2] [1, 1] : memref<1x125xf32, strided<[125, 1], offset: ?>> to memref> + %expand_shape_16 = memref.expand_shape %subview_15 [[0, 1]] : memref> into memref<1x?xf32, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4:2 = scf.if %3 -> (f16, f32) { + %8 = memref.load %expand_shape_14[%c0, %c0] : memref<1x?xf16, 
strided<[?, 1], offset: ?>> + %9 = memref.load %expand_shape_16[%c0, %c0] : memref<1x?xf32, strided<[?, 1], offset: ?>> + scf.yield %8, %9 : f16, f32 + } else { + scf.yield %cst_0, %cst : f16, f32 + } + %5 = arith.extf %4#0 : f16 to f32 + %6 = arith.mulf %5, %4#1 : f32 + %7 = arith.addf %6, %cst : f32 + memref.store %7, %alloca[%arg3] : memref<128xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<64xf32, #gpu.address_space> + scf.forall (%arg3) in (64) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca[%0] : memref<128xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca[%3] : memref<128xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_7[%arg3] : memref<64xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_8 = memref.alloca() : memref<32xf32, #gpu.address_space> + scf.forall (%arg3) in (32) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_7[%0] : memref<64xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_7[%3] : memref<64xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_8[%arg3] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_9 = memref.alloca() : memref<16xf32, #gpu.address_space> + scf.forall (%arg3) in (16) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_8[%0] : memref<32xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_8[%3] : memref<32xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_9[%arg3] : memref<16xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_10 = memref.alloca() : memref<8xf32, #gpu.address_space> + scf.forall (%arg3) in (8) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_9[%0] : memref<16xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_9[%3] : memref<16xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_10[%arg3] : memref<8xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_11 = memref.alloca() : memref<4xf32, #gpu.address_space> + scf.forall (%arg3) in (4) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_10[%0] : memref<8xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_10[%3] : memref<8xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_11[%arg3] : memref<4xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_12 = memref.alloca() : memref<2xf32, #gpu.address_space> + scf.forall (%arg3) in (2) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_11[%0] : memref<4xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_11[%3] : memref<4xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_12[%arg3] : memref<2xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg3) in (1) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_12[%0] : memref<2xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_12[%3] : memref<2xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, 
%alloc_3[%arg2] : memref<32xf32> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + scf.forall (%arg2) in (1) { + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + scf.forall (%arg3) in (32) { + %0 = affine.apply #map10(%arg2)[%arg3] + %1 = memref.load %alloc_3[%0] : memref<32xf32> + %2 = arith.addf %1, %cst : f32 + memref.store %2, %alloca[%arg3] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf32, #gpu.address_space> + scf.forall (%arg3) in (16) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca[%0] : memref<32xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca[%3] : memref<32xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_4[%arg3] : memref<16xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf32, #gpu.address_space> + scf.forall (%arg3) in (8) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_4[%0] : memref<16xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_4[%3] : memref<16xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_5[%arg3] : memref<8xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf32, #gpu.address_space> + scf.forall (%arg3) in (4) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_5[%0] : memref<8xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_5[%3] : memref<8xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_6[%arg3] : memref<4xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf32, #gpu.address_space> + scf.forall (%arg3) in (2) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_6[%0] : memref<4xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_6[%3] : memref<4xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_7[%arg3] : memref<2xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg3) in (1) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_7[%0] : memref<2xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_7[%3] : memref<2xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloc[] : memref + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref + } + func.func private @Unknown148(%arg0: memref) -> memref attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 4.000000e+00 : f32 %alloc = memref.alloc() : memref - linalg.generic {indexing_maps = [#map5, #map5], iterator_types = []} ins(%arg0 : memref) outs(%alloc : memref) { + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%arg0 : memref) outs(%alloc : memref) { ^bb0(%in: f32, %out: f32): %0 = arith.negf %in : f32 %1 = arith.divf %0, %cst : f32 @@ -679,202 +1634,335 @@ module @IrToMhlo.2452 { } return %alloc : memref } - func.func private @Unknown142(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown149(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} 
{ + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<64x3x7x7xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x3x7x7xf16>) outs(%alloc : memref<64x3x7x7xf32>) attrs = {xla_shape = "f32[64,3,7,7]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<64x3x7x7xf32> } - func.func private @Unknown143(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown144(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown145(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown146(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown150(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {xla_shape = 
"f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<64x64x3x3xf32> } - func.func private @Unknown147(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown154(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x3x3xf16>) outs(%alloc : memref<128x64x3x3xf32>) attrs = {xla_shape = "f32[128,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<128x64x3x3xf32> } - func.func private @Unknown148(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown155(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], 
iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<128x128x3x3xf32> } - func.func private @Unknown149(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown156(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x1x1xf16>) outs(%alloc : memref<128x64x1x1xf32>) attrs = {xla_shape = "f32[128,64,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<128x64x1x1xf32> } - func.func private @Unknown150(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown151(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown152(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown159(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x3x3xf16>) outs(%alloc : memref<256x128x3x3xf32>) attrs = {xla_shape = "f32[256,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + 
scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<256x128x3x3xf32> } - func.func private @Unknown153(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown160(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<256x256x3x3xf32> } - func.func private @Unknown154(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown161(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x1x1xf16>) outs(%alloc : memref<256x128x1x1xf32>) attrs = {xla_shape = "f32[256,128,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<256x128x1x1xf32> } - func.func 
private @Unknown155(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown156(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown157(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown164(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x3x3xf16>) outs(%alloc : memref<512x256x3x3xf32>) attrs = {xla_shape = "f32[512,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<512x256x3x3xf32> } - func.func private @Unknown158(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown165(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + 
scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<512x512x3x3xf32> } - func.func private @Unknown159(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown166(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x1x1xf16>) outs(%alloc : memref<512x256x1x1xf32>) attrs = {xla_shape = "f32[512,256,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<512x256x1x1xf32> } - func.func private @Unknown160(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown161(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown163(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown170(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1000x512xf32> - linalg.generic 
{indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1000x512xf16>) outs(%alloc : memref<1000x512xf32>) attrs = {xla_shape = "f32[1000,512]{0,1}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c1000 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<1000x512xf32> } - func.func private @Unknown164(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown171(%arg0: memref<4x1000xf16>) -> memref<1000xf32> attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<1000xf32> + scf.forall (%arg1) in (32) { + %0 = affine.min #map11(%arg1) + %1 = affine.apply #map12(%arg1) + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %alloca_1 = memref.alloca() : memref<2x32xf32, #gpu.address_space> + scf.forall (%arg2, %arg3) in (2, 32) { + %2 = affine.min #map13(%arg3, %arg1) + %3 = affine.min #map14(%arg3, %arg1) + %4 = affine.apply #map3(%3, %2) + %5 = arith.cmpi ugt, %4, %c0 : index + %6 = scf.if %5 -> (f16) { + %12 = affine.apply #map4(%arg2) + %13 = affine.apply #map10(%arg1)[%2] + %14 = memref.load %arg0[%12, %13] : memref<4x1000xf16> + scf.yield %14 : f16 + } else { + scf.yield %cst_0 : f16 + } + %7 = arith.extf %6 : f16 to f32 + %8 = arith.addf %7, %cst : f32 + %9 = scf.if %5 -> (f16) { + %12 = affine.apply #map5(%arg2) + %13 = affine.apply #map10(%arg1)[%2] + %14 = memref.load %arg0[%12, %13] : memref<4x1000xf16> + scf.yield %14 : f16 + } else { + scf.yield %cst_0 : f16 + } + %10 = arith.extf %9 : f16 to f32 + %11 = arith.addf %8, %10 : f32 + memref.store %11, %alloca_1[%arg2, %arg3] : memref<2x32xf32, #gpu.address_space> + } {mapping = [#gpu.thread, #gpu.thread]} + scf.forall (%arg2) in (32) { + %2 = memref.load %alloca_1[%c0, %arg2] : memref<2x32xf32, #gpu.address_space> + %3 = arith.addf %2, %cst : f32 + %4 = memref.load %alloca_1[%c1, %arg2] : memref<2x32xf32, #gpu.address_space> + %5 = arith.addf %4, %3 : f32 + memref.store %5, %alloca[%arg2] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %subview = memref.subview %alloca[0] [%0] [1] : memref<32xf32, #gpu.address_space> to memref, #gpu.address_space> + %subview_2 = memref.subview %alloc[%1] [%0] [1] : memref<1000xf32> to memref> + memref.copy %subview, %subview_2 : memref, #gpu.address_space> to memref> + } {mapping = [#gpu.block]} + return %alloc : memref<1000xf32> + } + func.func private @Unknown172(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<1000xf32> - linalg.generic {indexing_maps = [#map6, #map6], iterator_types = ["parallel"]} ins(%arg0 : memref<1000xf32>) outs(%alloc : memref<1000xf32>) { - ^bb0(%in: f32, %out: f32): - %0 = arith.truncf %in : f32 
to f16 - %1 = arith.extf %0 : f16 to f32 - linalg.yield %1 : f32 + scf.for %arg1 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg1] [1] [1] : memref<1000xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1] [1] [1] : memref<1000xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f32): + %0 = arith.truncf %in : f32 to f16 + %1 = arith.extf %0 : f16 to f32 + linalg.yield %1 : f32 + } } return %alloc : memref<1000xf32> } @@ -886,344 +1974,340 @@ module @IrToMhlo.2452 { %alloc_0 = memref.alloc() : memref<4x64x112x112xf16> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc, %arg3, %arg4, %alloc_0) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x112x112xf16> %2 = call @Unknown3(%arg7) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %3 = call @Unknown4(%arg12) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %4 = call @Unknown5(%arg17) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %5 = call @Unknown6(%arg22) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %3 = call @Unknown3(%arg12) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %4 = call @Unknown3(%arg17) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %5 = call @Unknown3(%arg22) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %6 = call @Unknown7(%arg37) : (memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> %7 = call @Unknown8(%arg27) : (memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> %8 = call @Unknown9(%arg32) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> - %9 = call @Unknown10(%arg42) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> - %10 = call @Unknown11(%arg47) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %9 = call @Unknown9(%arg42) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %10 = call @Unknown9(%arg47) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %11 = call @Unknown12(%arg62) : (memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> %12 = call @Unknown13(%arg52) : (memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> %13 = call @Unknown14(%arg57) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> - %14 = call @Unknown15(%arg67) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> - %15 = call @Unknown16(%arg72) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %14 = call @Unknown14(%arg67) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %15 = call @Unknown14(%arg72) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %16 = call @Unknown17(%arg87) : (memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> %17 = call @Unknown18(%arg77) : (memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> %18 = call @Unknown19(%arg82) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> - %19 = call @Unknown20(%arg92) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> - %20 = call @Unknown21(%arg97) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %19 = call @Unknown19(%arg92) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %20 = call @Unknown19(%arg97) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %21 = call @Unknown22(%arg1) : (memref<4x1000xf32>) -> memref<4x1000xf16> %22 = call @Unknown23(%arg102) : (memref<1000x512xf32>) -> memref<1000x512xf16> - %alloc_1 = memref.alloc() : memref<4xf16> - byre.compute @ReduceSumOp_f16_f16(%21, %alloc_1) {dimensions 
= dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %23:2 = call @Unknown24(%alloc_0) : (memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) + %23 = call @Unknown24(%arg103) : (memref<1000xf32>) -> memref<1000xf16> + %24 = call @Unknown25(%21) : (memref<4x1000xf16>) -> memref<4xf16> + %25:2 = call @Unknown26(%alloc_0) : (memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) + %alloc_1 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @PoolMaxOp_f16_f16(%25#0, %alloc_1) {base_dilations = dense<1> : tensor<4xi64>, memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16> %alloc_2 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @PoolMaxOp_f16_f16(%23#0, %alloc_2) {base_dilations = dense<1> : tensor<4xi64>, memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16> + byre.compute @ConvOp_f16f16_f16(%alloc_1, %2, %alloc_2) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_3 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%alloc_2, %2, %alloc_3) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_2, %arg8, %arg9, %alloc_3) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %26:2 = call @Unknown28(%alloc_3) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_4 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_3, %arg8, %arg9, %alloc_4) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %24:2 = call @Unknown26(%alloc_4) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%26#0, %3, %alloc_4) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation 
= dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_5 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%24#0, %3, %alloc_5) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_4, %arg13, %arg14, %alloc_5) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %27:2 = call @Unknown30(%alloc_5, %alloc_1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_6 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_5, %arg13, %arg14, %alloc_6) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %25:2 = call @Unknown28(%alloc_6, %alloc_2) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%27#0, %4, %alloc_6) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_7 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%25#0, %4, %alloc_7) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_6, %arg18, %arg19, %alloc_7) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %28:2 = call @Unknown28(%alloc_7) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_8 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_7, %arg18, %arg19, %alloc_8) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %26:2 = call @Unknown30(%alloc_8) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%28#0, %5, %alloc_8) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", 
lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_9 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%26#0, %5, %alloc_9) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_10 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_9, %arg23, %arg24, %alloc_10) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %27:2 = call @Unknown32(%alloc_10, %25#0) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_8, %arg23, %arg24, %alloc_9) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %29:2 = call @Unknown30(%alloc_9, %27#0) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + %alloc_10 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvOp_f16f16_f16(%29#0, %6, %alloc_10) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x1x1xf16>, memref<4x128x28x28xf16> %alloc_11 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%27#0, %6, %alloc_11) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x1x1xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_10, %arg38, %arg39, %alloc_11) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> %alloc_12 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_11, %arg38, %arg39, %alloc_12) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + byre.compute @ConvOp_f16f16_f16(%29#0, %7, %alloc_12) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout 
= "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x3x3xf16>, memref<4x128x28x28xf16> %alloc_13 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%27#0, %7, %alloc_13) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_12, %arg28, %arg29, %alloc_13) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %30:2 = call @Unknown37(%alloc_13) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_14 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_13, %arg28, %arg29, %alloc_14) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %28:2 = call @Unknown35(%alloc_14) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%30#0, %8, %alloc_14) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_15 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%28#0, %8, %alloc_15) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_14, %arg33, %arg34, %alloc_15) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %31:2 = call @Unknown39(%alloc_15, %alloc_11) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_16 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_15, %arg33, %arg34, %alloc_16) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %29:2 = call @Unknown37(%alloc_16, %alloc_12) : (memref<4x128x28x28xf16>, 
memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%31#0, %9, %alloc_16) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_17 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%29#0, %9, %alloc_17) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_16, %arg43, %arg44, %alloc_17) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %32:2 = call @Unknown37(%alloc_17) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_18 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_17, %arg43, %arg44, %alloc_18) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %30:2 = call @Unknown39(%alloc_18) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%32#0, %10, %alloc_18) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_19 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%30#0, %10, %alloc_19) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_20 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_19, %arg48, %arg49, %alloc_20) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %31:2 = call @Unknown41(%alloc_20, %29#0) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_18, %arg48, %arg49, %alloc_19) {epsilon = 9.99999974E-6 : f32, 
feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %33:2 = call @Unknown39(%alloc_19, %31#0) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + %alloc_20 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvOp_f16f16_f16(%33#0, %11, %alloc_20) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x1x1xf16>, memref<4x256x14x14xf16> %alloc_21 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%31#0, %11, %alloc_21) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x1x1xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_20, %arg63, %arg64, %alloc_21) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> %alloc_22 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_21, %arg63, %arg64, %alloc_22) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + byre.compute @ConvOp_f16f16_f16(%33#0, %12, %alloc_22) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x3x3xf16>, memref<4x256x14x14xf16> %alloc_23 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%31#0, %12, %alloc_23) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_22, %arg53, %arg54, %alloc_23) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %34:2 = call @Unknown46(%alloc_23) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_24 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_23, %arg53, %arg54, %alloc_24) {epsilon = 
9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %32:2 = call @Unknown44(%alloc_24) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%34#0, %13, %alloc_24) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_25 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%32#0, %13, %alloc_25) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_24, %arg58, %arg59, %alloc_25) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %35:2 = call @Unknown48(%alloc_25, %alloc_21) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_26 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_25, %arg58, %arg59, %alloc_26) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %33:2 = call @Unknown46(%alloc_26, %alloc_22) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%35#0, %14, %alloc_26) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_27 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%33#0, %14, %alloc_27) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_26, %arg68, %arg69, %alloc_27) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %36:2 = call 
@Unknown46(%alloc_27) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_28 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_27, %arg68, %arg69, %alloc_28) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %34:2 = call @Unknown48(%alloc_28) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%36#0, %15, %alloc_28) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_29 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%34#0, %15, %alloc_29) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_30 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_29, %arg73, %arg74, %alloc_30) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %35:2 = call @Unknown50(%alloc_30, %33#0) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_28, %arg73, %arg74, %alloc_29) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %37:2 = call @Unknown48(%alloc_29, %35#0) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + %alloc_30 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvOp_f16f16_f16(%37#0, %16, %alloc_30) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x1x1xf16>, memref<4x512x7x7xf16> %alloc_31 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%35#0, %16, %alloc_31) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x1x1xf16>, memref<4x512x7x7xf16> + byre.compute 
@BatchNormTrainingOp_f16f32f32_f16(%alloc_30, %arg88, %arg89, %alloc_31) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16>
     %alloc_32 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_31, %arg88, %arg89, %alloc_32) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16>
+    byre.compute @ConvOp_f16f16_f16(%37#0, %17, %alloc_32) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x3x3xf16>, memref<4x512x7x7xf16>
     %alloc_33 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @ConvOp_f16f16_f16(%35#0, %17, %alloc_33) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x3x3xf16>, memref<4x512x7x7xf16>
+    byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_32, %arg78, %arg79, %alloc_33) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16>
+    %38:2 = call @Unknown55(%alloc_33) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>)
     %alloc_34 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_33, %arg78, %arg79, %alloc_34) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16>
-    %36:2 = call @Unknown53(%alloc_34) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>)
+    byre.compute @ConvOp_f16f16_f16(%38#0, %18, %alloc_34) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16>
     %alloc_35 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @ConvOp_f16f16_f16(%36#0, %18, %alloc_35) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16>
+    byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_34, %arg83, %arg84, %alloc_35) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16>
+    %39:2 = call @Unknown57(%alloc_35, %alloc_31) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>)
     %alloc_36 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_35, %arg83, %arg84, %alloc_36) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16>
-    %37:2 = call @Unknown55(%alloc_36, %alloc_32) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>)
+    byre.compute @ConvOp_f16f16_f16(%39#0, %19, %alloc_36) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16>
     %alloc_37 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @ConvOp_f16f16_f16(%37#0, %19, %alloc_37) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16>
+    byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_36, %arg93, %arg94, %alloc_37) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16>
+    %40:2 = call @Unknown55(%alloc_37) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>)
     %alloc_38 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_37, %arg93, %arg94, %alloc_38) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16>
-    %38:2 = call @Unknown57(%alloc_38) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>)
+    byre.compute @ConvOp_f16f16_f16(%40#0, %20, %alloc_38) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16>
     %alloc_39 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @ConvOp_f16f16_f16(%38#0, %20, %alloc_39) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>,
memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_40 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_39, %arg98, %arg99, %alloc_40) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %39:2 = call @Unknown59(%alloc_40, %37#0) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_38, %arg98, %arg99, %alloc_39) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %41:2 = call @Unknown57(%alloc_39, %39#0) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + %42 = call @Unknown62(%41#0) : (memref<4x512x7x7xf16>) -> memref<4x512xf16> + %43 = call @Unknown63(%42) : (memref<4x512xf16>) -> memref<4x512xf16> + %alloc_40 = memref.alloc() : memref<4x1000xf16> + byre.compute @MatmulOp_f16f16_f16(%43, %22, %alloc_40) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16>, memref<1000x512xf16>, memref<4x1000xf16> + %44 = call @Unknown64(%23, %alloc_40) : (memref<1000xf16>, memref<4x1000xf16>) -> memref<4x1000xf16> + %45 = call @Unknown65(%44) : (memref<4x1000xf16>) -> memref<4xf16> + %46 = call @Unknown66(%45, %44) : (memref<4xf16>, memref<4x1000xf16>) -> memref<4x1000xf16> + %47 = call @Unknown67(%46) : (memref<4x1000xf16>) -> memref<4xf16> + %48 = call @Unknown68(%47) : (memref<4xf16>) -> memref<4xf16> + %49:2 = call @Unknown69(%48, %46, %24, %21) : (memref<4xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) %alloc_41 = memref.alloc() : memref<4x512xf16> - byre.compute @ReduceSumOp_f16_f16(%39#0, %alloc_41) {dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<4x512xf16> - %40 = call @Unknown60(%alloc_41) : (memref<4x512xf16>) -> memref<4x512xf16> - %alloc_42 = memref.alloc() : memref<4x1000xf16> - byre.compute @MatmulOp_f16f16_f16(%40, %22, %alloc_42) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16>, memref<1000x512xf16>, memref<4x1000xf16> - %41 = call @Unknown61(%arg103, %alloc_42) : (memref<1000xf32>, memref<4x1000xf16>) -> memref<4x1000xf16> - %alloc_43 = memref.alloc() : memref<4xf16> - byre.compute @ReduceMaxOp_f16_f16(%41, %alloc_43) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %42:2 = call @Unknown62(%alloc_43, %41) : (memref<4xf16>, memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) - %alloc_44 = memref.alloc() : memref<4xf16> - byre.compute @ReduceSumOp_f16_f16(%42#1, %alloc_44) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %43:3 = call @Unknown63(%alloc_44, %42#0, %alloc_1, %21, %arg1) : (memref<4xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4x1000xf16>, memref<4x1000xf32>) -> (memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>) - %alloc_45 = memref.alloc() : memref<4x512xf16> - byre.compute @MatmulOp_f16f16_f16(%43#0, %22, %alloc_45) {lhs_contracting_dimension = 
1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16>, memref<1000x512xf16>, memref<4x512xf16> - %44 = call @Unknown64(%alloc_45, %39#1) : (memref<4x512xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> - %alloc_46 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_47 = memref.alloc() : memref<512xf32> + byre.compute @MatmulOp_f16f16_f16(%49#1, %22, %alloc_41) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16>, memref<1000x512xf16>, memref<4x512xf16> + %50 = call @Unknown70(%alloc_41, %41#1) : (memref<4x512xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> + %alloc_42 = memref.alloc() : memref<4x512x7x7xf16> + %alloc_43 = memref.alloc() : memref<512xf32> + %alloc_44 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_38, %arg98, %50, %alloc_42, %alloc_43, %alloc_44) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_45 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_42, %20, %alloc_45) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_46 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%40#0, %alloc_42, %alloc_46) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %51 = call @Unknown74(%40#1, %alloc_45) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> + %alloc_47 = memref.alloc() : memref<4x512x7x7xf16> %alloc_48 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_39, %arg98, %44, %alloc_46, %alloc_47, %alloc_48) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_49 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_46, %20, %alloc_49) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_50 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_46, %alloc_50) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : 
tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %45 = call @Unknown68(%38#1, %alloc_49) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> - %alloc_51 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_52 = memref.alloc() : memref<512xf32> + %alloc_49 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_36, %arg93, %51, %alloc_47, %alloc_48, %alloc_49) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_50 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_47, %19, %alloc_50) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_51 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%39#0, %alloc_47, %alloc_51) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %52 = call @Unknown78(%50, %alloc_50, %39#1) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> + %alloc_52 = memref.alloc() : memref<4x512x7x7xf16> %alloc_53 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_37, %arg93, %45, %alloc_51, %alloc_52, %alloc_53) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_54 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_51, %19, %alloc_54) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_55 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_51, %alloc_55) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %46 = call @Unknown72(%44, %alloc_54, %37#1) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> - %alloc_56 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_57 = memref.alloc() : memref<512xf32> + %alloc_54 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_34, %arg83, %52, %alloc_52, %alloc_53, %alloc_54) {epsilon = 
9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_55 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_52, %18, %alloc_55) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_56 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_52, %alloc_56) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %53 = call @Unknown74(%38#1, %alloc_55) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> + %alloc_57 = memref.alloc() : memref<4x512x7x7xf16> %alloc_58 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_35, %arg83, %46, %alloc_56, %alloc_57, %alloc_58) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_59 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_56, %18, %alloc_59) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_60 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_56, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %47 = call @Unknown76(%36#1, %alloc_59) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> - %alloc_61 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_62 = memref.alloc() : memref<512xf32> + %alloc_59 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_32, %arg78, %53, %alloc_57, %alloc_58, %alloc_59) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_60 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_57, %17, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_61 = memref.alloc() : memref<512x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_57, %alloc_61) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x3x3xf16> + %alloc_62 = memref.alloc() : memref<4x512x7x7xf16> %alloc_63 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_33, %arg78, %47, %alloc_61, %alloc_62, %alloc_63) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_64 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_61, %17, %alloc_64) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_65 = memref.alloc() : memref<512x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_61, %alloc_65) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x3x3xf16> - %alloc_66 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_67 = memref.alloc() : memref<512xf32> - %alloc_68 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_31, %arg88, %46, %alloc_66, %alloc_67, %alloc_68) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_69 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_66, %16, %alloc_69) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x1x1xf16>, memref<4x256x14x14xf16> - %alloc_70 = memref.alloc() : memref<512x256x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_66, %alloc_70) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x1x1xf16> - %48 = call @Unknown83(%alloc_69, %alloc_64, %35#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, 
memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> - %alloc_71 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_72 = memref.alloc() : memref<256xf32> + %alloc_64 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_30, %arg88, %52, %alloc_62, %alloc_63, %alloc_64) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_65 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_62, %16, %alloc_65) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x1x1xf16>, memref<4x256x14x14xf16> + %alloc_66 = memref.alloc() : memref<512x256x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_62, %alloc_66) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x1x1xf16> + %54 = call @Unknown89(%alloc_65, %alloc_60, %37#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> + %alloc_67 = memref.alloc() : memref<4x256x14x14xf16> + %alloc_68 = memref.alloc() : memref<256xf32> + %alloc_69 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_28, %arg73, %54, %alloc_67, %alloc_68, %alloc_69) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_70 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_67, %15, %alloc_70) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_71 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_67, %alloc_71) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %55 = call @Unknown93(%36#1, %alloc_70) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> + %alloc_72 = memref.alloc() : memref<4x256x14x14xf16> %alloc_73 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_29, %arg73, %48, %alloc_71, %alloc_72, %alloc_73) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : 
memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_74 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_71, %15, %alloc_74) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_75 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_71, %alloc_75) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %49 = call @Unknown87(%34#1, %alloc_74) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> - %alloc_76 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_77 = memref.alloc() : memref<256xf32> + %alloc_74 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_26, %arg68, %55, %alloc_72, %alloc_73, %alloc_74) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_75 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_72, %14, %alloc_75) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_76 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_72, %alloc_76) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %56 = call @Unknown89(%54, %alloc_75, %35#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> + %alloc_77 = memref.alloc() : memref<4x256x14x14xf16> %alloc_78 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_27, %arg68, %49, %alloc_76, %alloc_77, %alloc_78) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_79 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_76, %14, %alloc_79) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> 
: tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_80 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_76, %alloc_80) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %50 = call @Unknown91(%48, %alloc_79, %33#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> - %alloc_81 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_82 = memref.alloc() : memref<256xf32> + %alloc_79 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_24, %arg58, %56, %alloc_77, %alloc_78, %alloc_79) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_80 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_77, %13, %alloc_80) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_81 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_77, %alloc_81) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %57 = call @Unknown93(%34#1, %alloc_80) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> + %alloc_82 = memref.alloc() : memref<4x256x14x14xf16> %alloc_83 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_25, %arg58, %50, %alloc_81, %alloc_82, %alloc_83) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_84 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_81, %13, %alloc_84) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_85 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_81, %alloc_85) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", 
padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %51 = call @Unknown95(%32#1, %alloc_84) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> - %alloc_86 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_87 = memref.alloc() : memref<256xf32> + %alloc_84 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_22, %arg53, %57, %alloc_82, %alloc_83, %alloc_84) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_85 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_82, %12, %alloc_85) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_86 = memref.alloc() : memref<256x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_82, %alloc_86) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x3x3xf16> + %alloc_87 = memref.alloc() : memref<4x256x14x14xf16> %alloc_88 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_23, %arg53, %51, %alloc_86, %alloc_87, %alloc_88) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_89 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_86, %12, %alloc_89) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_90 = memref.alloc() : memref<256x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_86, %alloc_90) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x3x3xf16> - %alloc_91 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_92 = memref.alloc() : memref<256xf32> - %alloc_93 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_21, %arg63, %50, %alloc_91, %alloc_92, %alloc_93) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, 
memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_94 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_91, %11, %alloc_94) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x1x1xf16>, memref<4x128x28x28xf16> - %alloc_95 = memref.alloc() : memref<256x128x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_91, %alloc_95) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x1x1xf16> - %52 = call @Unknown102(%alloc_94, %alloc_89, %31#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> - %alloc_96 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_97 = memref.alloc() : memref<128xf32> + %alloc_89 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_20, %arg63, %56, %alloc_87, %alloc_88, %alloc_89) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_90 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_87, %11, %alloc_90) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x1x1xf16>, memref<4x128x28x28xf16> + %alloc_91 = memref.alloc() : memref<256x128x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_87, %alloc_91) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x1x1xf16> + %58 = call @Unknown108(%alloc_90, %alloc_85, %33#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> + %alloc_92 = memref.alloc() : memref<4x128x28x28xf16> + %alloc_93 = memref.alloc() : memref<128xf32> + %alloc_94 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_18, %arg48, %58, %alloc_92, %alloc_93, %alloc_94) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_95 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_92, %10, %alloc_95) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 
: i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_96 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_92, %alloc_96) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %59 = call @Unknown112(%32#1, %alloc_95) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> + %alloc_97 = memref.alloc() : memref<4x128x28x28xf16> %alloc_98 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_19, %arg48, %52, %alloc_96, %alloc_97, %alloc_98) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_99 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_96, %10, %alloc_99) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_100 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_96, %alloc_100) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %53 = call @Unknown106(%30#1, %alloc_99) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> - %alloc_101 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_102 = memref.alloc() : memref<128xf32> + %alloc_99 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_16, %arg43, %59, %alloc_97, %alloc_98, %alloc_99) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_100 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_97, %9, %alloc_100) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_101 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_97, %alloc_101) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : 
i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %60 = call @Unknown108(%58, %alloc_100, %31#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> + %alloc_102 = memref.alloc() : memref<4x128x28x28xf16> %alloc_103 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_17, %arg43, %53, %alloc_101, %alloc_102, %alloc_103) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_104 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_101, %9, %alloc_104) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_105 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_101, %alloc_105) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %54 = call @Unknown110(%52, %alloc_104, %29#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> - %alloc_106 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_107 = memref.alloc() : memref<128xf32> + %alloc_104 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_14, %arg33, %60, %alloc_102, %alloc_103, %alloc_104) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_105 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_102, %8, %alloc_105) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_106 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_102, %alloc_106) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %61 = call @Unknown112(%30#1, %alloc_105) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> + %alloc_107 = memref.alloc() : memref<4x128x28x28xf16> %alloc_108 = memref.alloc() : 
memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_15, %arg33, %54, %alloc_106, %alloc_107, %alloc_108) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_109 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_106, %8, %alloc_109) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_110 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_106, %alloc_110) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %55 = call @Unknown114(%28#1, %alloc_109) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> - %alloc_111 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_112 = memref.alloc() : memref<128xf32> + %alloc_109 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_12, %arg28, %61, %alloc_107, %alloc_108, %alloc_109) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_110 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_107, %7, %alloc_110) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_111 = memref.alloc() : memref<128x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_107, %alloc_111) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x3x3xf16> + %alloc_112 = memref.alloc() : memref<4x128x28x28xf16> %alloc_113 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_13, %arg28, %55, %alloc_111, %alloc_112, %alloc_113) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_114 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_111, %7, %alloc_114) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, 
input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_115 = memref.alloc() : memref<128x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_111, %alloc_115) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x3x3xf16> - %alloc_116 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_117 = memref.alloc() : memref<128xf32> - %alloc_118 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_11, %arg38, %54, %alloc_116, %alloc_117, %alloc_118) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_119 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_116, %6, %alloc_119) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x1x1xf16>, memref<4x64x56x56xf16> - %alloc_120 = memref.alloc() : memref<128x64x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_116, %alloc_120) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x1x1xf16> - %56 = call @Unknown121(%alloc_119, %alloc_114, %27#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> - %alloc_121 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_122 = memref.alloc() : memref<64xf32> + %alloc_114 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_10, %arg38, %60, %alloc_112, %alloc_113, %alloc_114) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_115 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_112, %6, %alloc_115) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x1x1xf16>, memref<4x64x56x56xf16> + %alloc_116 = memref.alloc() : memref<128x64x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_112, %alloc_116) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = 
"NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x1x1xf16> + %62 = call @Unknown127(%alloc_115, %alloc_110, %29#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> + %alloc_117 = memref.alloc() : memref<4x64x56x56xf16> + %alloc_118 = memref.alloc() : memref<64xf32> + %alloc_119 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_8, %arg23, %62, %alloc_117, %alloc_118, %alloc_119) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_120 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_117, %5, %alloc_120) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_121 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_117, %alloc_121) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %63 = call @Unknown131(%28#1, %alloc_120) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_122 = memref.alloc() : memref<4x64x56x56xf16> %alloc_123 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_9, %arg23, %56, %alloc_121, %alloc_122, %alloc_123) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_124 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_121, %5, %alloc_124) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_125 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_121, %alloc_125) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %57 = call @Unknown125(%26#1, %alloc_124) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_126 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_127 = memref.alloc() : memref<64xf32> + %alloc_124 
= memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_6, %arg18, %63, %alloc_122, %alloc_123, %alloc_124) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_125 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_122, %4, %alloc_125) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_126 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_122, %alloc_126) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %64 = call @Unknown127(%62, %alloc_125, %27#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> + %alloc_127 = memref.alloc() : memref<4x64x56x56xf16> %alloc_128 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_7, %arg18, %57, %alloc_126, %alloc_127, %alloc_128) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_129 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_126, %4, %alloc_129) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_130 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%25#0, %alloc_126, %alloc_130) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %58 = call @Unknown129(%56, %alloc_129, %25#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> - %alloc_131 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_132 = memref.alloc() : memref<64xf32> + %alloc_129 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_4, %arg13, %64, %alloc_127, %alloc_128, %alloc_129) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_130 = memref.alloc() : 
memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_127, %3, %alloc_130) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_131 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_127, %alloc_131) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %65 = call @Unknown131(%26#1, %alloc_130) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_132 = memref.alloc() : memref<4x64x56x56xf16> %alloc_133 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_5, %arg13, %58, %alloc_131, %alloc_132, %alloc_133) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_134 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_131, %3, %alloc_134) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_135 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%24#0, %alloc_131, %alloc_135) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %59 = call @Unknown133(%24#1, %alloc_134) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_136 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_137 = memref.alloc() : memref<64xf32> - %alloc_138 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_3, %arg8, %59, %alloc_136, %alloc_137, %alloc_138) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_139 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_136, %2, %alloc_139) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_140 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute 
@ConvBackwardFilterOp_f16f16_f16(%alloc_2, %alloc_136, %alloc_140) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %60 = call @Unknown137(%58, %alloc_139) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_141 = memref.alloc() : memref<4x64x112x112xf16> - byre.compute @PoolMaxGradOp_f16f16_f16(%23#0, %60, %alloc_141) {memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16>, memref<4x64x112x112xf16> - %61 = call @Unknown138(%23#1, %alloc_141) : (memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> - %alloc_142 = memref.alloc() : memref<4x64x112x112xf16> - %alloc_143 = memref.alloc() : memref<64xf32> - %alloc_144 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %61, %alloc_142, %alloc_143, %alloc_144) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<4x64x112x112xf16>, memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32> - %alloc_145 = memref.alloc() : memref<64x3x7x7xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_142, %alloc_145) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16>, memref<4x64x112x112xf16>, memref<64x3x7x7xf16> - %alloc_146 = memref.alloc() : memref - byre.compute @ReduceSumOp_f32_f32(%43#1, %alloc_146) {dimensions = dense<[0, 1]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32>, memref - %62 = call @Unknown141(%alloc_146) : (memref) -> memref - %63 = call @Unknown142(%alloc_145) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> - %64 = call @Unknown143(%alloc_140) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %65 = call @Unknown144(%alloc_135) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %66 = call @Unknown145(%alloc_130) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %67 = call @Unknown146(%alloc_125) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %68 = call @Unknown147(%alloc_115) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> - %69 = call @Unknown148(%alloc_110) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %70 = call @Unknown149(%alloc_120) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> - %71 = call @Unknown150(%alloc_105) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %72 = call @Unknown151(%alloc_100) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %73 = call @Unknown152(%alloc_90) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> - %74 = call @Unknown153(%alloc_85) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %75 = call @Unknown154(%alloc_95) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> - %76 = call @Unknown155(%alloc_80) : (memref<256x256x3x3xf16>) -> 
memref<256x256x3x3xf32> - %77 = call @Unknown156(%alloc_75) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %78 = call @Unknown157(%alloc_65) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> - %79 = call @Unknown158(%alloc_60) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %80 = call @Unknown159(%alloc_70) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> - %81 = call @Unknown160(%alloc_55) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %82 = call @Unknown161(%alloc_50) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %alloc_147 = memref.alloc() : memref<1000x512xf16> - byre.compute @MatmulOp_f16f16_f16(%40, %43#0, %alloc_147) {lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16>, memref<4x1000xf16>, memref<1000x512xf16> - %83 = call @Unknown163(%alloc_147) : (memref<1000x512xf16>) -> memref<1000x512xf32> - %alloc_148 = memref.alloc() : memref<1000xf32> - byre.compute @ReduceSumOp_f32_f32(%43#2, %alloc_148) {dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32>, memref<1000xf32> - %84 = call @Unknown164(%alloc_148) : (memref<1000xf32>) -> memref<1000xf32> - return %62, %63, %alloc_143, %alloc_144, %64, %alloc_137, %alloc_138, %65, %alloc_132, %alloc_133, %66, %alloc_127, %alloc_128, %67, %alloc_122, %alloc_123, %68, %alloc_112, %alloc_113, %69, %alloc_107, %alloc_108, %70, %alloc_117, %alloc_118, %71, %alloc_102, %alloc_103, %72, %alloc_97, %alloc_98, %73, %alloc_87, %alloc_88, %74, %alloc_82, %alloc_83, %75, %alloc_92, %alloc_93, %76, %alloc_77, %alloc_78, %77, %alloc_72, %alloc_73, %78, %alloc_62, %alloc_63, %79, %alloc_57, %alloc_58, %80, %alloc_67, %alloc_68, %81, %alloc_52, %alloc_53, %82, %alloc_47, %alloc_48, %83, %84 : memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32> + %alloc_134 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_2, %arg8, %65, %alloc_132, %alloc_133, %alloc_134) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_135 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute 
@ConvBackwardDataOp_f16f16_f16(%alloc_132, %2, %alloc_135) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_136 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%alloc_1, %alloc_132, %alloc_136) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %66 = call @Unknown143(%64, %alloc_135) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_137 = memref.alloc() : memref<4x64x112x112xf16> + byre.compute @PoolMaxGradOp_f16f16_f16(%25#0, %66, %alloc_137) {memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16>, memref<4x64x112x112xf16> + %67 = call @Unknown144(%25#1, %alloc_137) : (memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> + %alloc_138 = memref.alloc() : memref<4x64x112x112xf16> + %alloc_139 = memref.alloc() : memref<64xf32> + %alloc_140 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %67, %alloc_138, %alloc_139, %alloc_140) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<4x64x112x112xf16>, memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32> + %alloc_141 = memref.alloc() : memref<64x3x7x7xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_138, %alloc_141) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16>, memref<4x64x112x112xf16>, memref<64x3x7x7xf16> + %68 = call @Unknown147(%49#0, %arg1) : (memref<4x1000xf16>, memref<4x1000xf32>) -> memref + %69 = call @Unknown148(%68) : (memref) -> memref + %70 = call @Unknown149(%alloc_141) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> + %71 = call @Unknown150(%alloc_136) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %72 = call @Unknown150(%alloc_131) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %73 = call @Unknown150(%alloc_126) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %74 = call @Unknown150(%alloc_121) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %75 = call @Unknown154(%alloc_111) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> + %76 = call @Unknown155(%alloc_106) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %77 = call @Unknown156(%alloc_116) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> + %78 = call @Unknown155(%alloc_101) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %79 = call @Unknown155(%alloc_96) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %80 = call 
@Unknown159(%alloc_86) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> + %81 = call @Unknown160(%alloc_81) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %82 = call @Unknown161(%alloc_91) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> + %83 = call @Unknown160(%alloc_76) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %84 = call @Unknown160(%alloc_71) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %85 = call @Unknown164(%alloc_61) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> + %86 = call @Unknown165(%alloc_56) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %87 = call @Unknown166(%alloc_66) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> + %88 = call @Unknown165(%alloc_51) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %89 = call @Unknown165(%alloc_46) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %alloc_142 = memref.alloc() : memref<1000x512xf16> + byre.compute @MatmulOp_f16f16_f16(%43, %49#1, %alloc_142) {lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16>, memref<4x1000xf16>, memref<1000x512xf16> + %90 = call @Unknown170(%alloc_142) : (memref<1000x512xf16>) -> memref<1000x512xf32> + %91 = call @Unknown171(%49#1) : (memref<4x1000xf16>) -> memref<1000xf32> + %92 = call @Unknown172(%91) : (memref<1000xf32>) -> memref<1000xf32> + return %69, %70, %alloc_139, %alloc_140, %71, %alloc_133, %alloc_134, %72, %alloc_128, %alloc_129, %73, %alloc_123, %alloc_124, %74, %alloc_118, %alloc_119, %75, %alloc_108, %alloc_109, %76, %alloc_103, %alloc_104, %77, %alloc_113, %alloc_114, %78, %alloc_98, %alloc_99, %79, %alloc_93, %alloc_94, %80, %alloc_83, %alloc_84, %81, %alloc_78, %alloc_79, %82, %alloc_88, %alloc_89, %83, %alloc_73, %alloc_74, %84, %alloc_68, %alloc_69, %85, %alloc_58, %alloc_59, %86, %alloc_53, %alloc_54, %87, %alloc_63, %alloc_64, %88, %alloc_48, %alloc_49, %89, %alloc_43, %alloc_44, %90, %92 : memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/6_gpu_opt.mlir b/compiler/test/E2E/ResNet18/Whole/6_gpu_opt.mlir index 121c21a6b..ad5504a74 100644 --- a/compiler/test/E2E/ResNet18/Whole/6_gpu_opt.mlir +++ b/compiler/test/E2E/ResNet18/Whole/6_gpu_opt.mlir @@ -4,2766 +4,1530 @@ module 
@IrToMhlo.2452 { func.func private @Unknown0(%arg0: memref<4x3x224x224xf32>) -> memref<4x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c602112 = arith.constant 602112 : index - %c1 = arith.constant 1 : index %c224 = arith.constant 224 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c602112 = arith.constant 602112 : index %alloc = memref.alloc() : memref<4x3x224x224xf16> scf.for %arg1 = %c0 to %c602112 step %c1 { %0 = arith.remsi %arg1, %c224 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c224 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c224 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c224 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c224 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c224 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c3 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c3 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c3 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x3x224x224xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x3x224x224xf16> + %1 = arith.divsi %arg1, %c224 : index + %2 = arith.remsi %1, %c224 : index + %3 = arith.divsi %1, %c224 : index + %4 = arith.remsi %3, %c3 : index + %5 = arith.divsi %3, %c3 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x3x224x224xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<4x3x224x224xf16> } return %alloc : memref<4x3x224x224xf16> } func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c1 = arith.constant 1 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c9408 = arith.constant 9408 : index %alloc = memref.alloc() : memref<64x3x7x7xf16> scf.for %arg1 = %c0 to %c9408 step %c1 { %0 = arith.remsi %arg1, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, 
%17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c3 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c3 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c3 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x3x7x7xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x3x7x7xf16> + %1 = arith.divsi %arg1, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c3 : index + %5 = arith.divsi %3, %c3 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<64x3x7x7xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<64x3x7x7xf16> } return %alloc : memref<64x3x7x7xf16> } func.func private @Unknown3(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf16> - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = 
arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf16> - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown5(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf16> - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<64x64x3x3xf16> scf.for %arg1 = %c0 to %c36864 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : 
index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<64x64x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<64x64x3x3xf16> } return %alloc : memref<64x64x3x3xf16> } func.func private @Unknown7(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<128x64x1x1xf16> scf.for %arg1 = %c0 to %c8192 step %c1 { %0 = arith.remsi %arg1, %c64 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c64 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c64 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<128x64x1x1xf32> - %11 = arith.truncf %10 : f32 to f16 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<128x64x1x1xf16> + %1 = arith.divsi %arg1, %c64 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<128x64x1x1xf32> + %3 = arith.truncf %2 : f32 to f16 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<128x64x1x1xf16> } return %alloc : memref<128x64x1x1xf16> } func.func private @Unknown8(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c73728 = arith.constant 73728 : index %alloc = memref.alloc() : memref<128x64x3x3xf16> scf.for %arg1 = %c0 to %c73728 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = 
arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x64x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<128x64x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<128x64x3x3xf16> } return %alloc : memref<128x64x3x3xf16> } func.func private @Unknown9(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - scf.for %arg1 = %c0 to %c147456 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf16> - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown10(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index 
- %c147456 = arith.constant 147456 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - scf.for %arg1 = %c0 to %c147456 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf16> - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown11(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c147456 = arith.constant 147456 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<128x128x3x3xf16> scf.for %arg1 = %c0 to %c147456 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = 
arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<128x128x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<128x128x3x3xf16> } return %alloc : memref<128x128x3x3xf16> } func.func private @Unknown12(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<256x128x1x1xf16> scf.for %arg1 = %c0 to %c32768 step %c1 { %0 = arith.remsi %arg1, %c128 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c128 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c128 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<256x128x1x1xf32> - %11 = arith.truncf %10 : f32 to f16 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<256x128x1x1xf16> + %1 = arith.divsi %arg1, %c128 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<256x128x1x1xf32> + %3 = arith.truncf %2 : f32 to f16 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<256x128x1x1xf16> } return %alloc : memref<256x128x1x1xf16> } func.func private @Unknown13(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c294912 = arith.constant 294912 : index %alloc = memref.alloc() : memref<256x128x3x3xf16> scf.for %arg1 = %c0 to %c294912 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x128x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x128x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = 
arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<256x128x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<256x128x3x3xf16> } return %alloc : memref<256x128x3x3xf16> } func.func private @Unknown14(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - scf.for %arg1 = %c0 to %c589824 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf16> - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown15(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - scf.for %arg1 = %c0 to %c589824 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select 
%21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf16> - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown16(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c589824 = arith.constant 589824 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x256x3x3xf16> scf.for %arg1 = %c0 to %c589824 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<256x256x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<256x256x3x3xf16> } return %alloc : memref<256x256x3x3xf16> } func.func private @Unknown17(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c1 = arith.constant 1 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<512x256x1x1xf16> scf.for %arg1 = %c0 to %c131072 step %c1 { %0 = arith.remsi %arg1, %c256 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c256 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c256 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = 
memref.load %arg0[%9, %3, %c0, %c0] : memref<512x256x1x1xf32> - %11 = arith.truncf %10 : f32 to f16 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<512x256x1x1xf16> + %1 = arith.divsi %arg1, %c256 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<512x256x1x1xf32> + %3 = arith.truncf %2 : f32 to f16 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<512x256x1x1xf16> } return %alloc : memref<512x256x1x1xf16> } func.func private @Unknown18(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1179648 = arith.constant 1179648 : index %alloc = memref.alloc() : memref<512x256x3x3xf16> scf.for %arg1 = %c0 to %c1179648 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x256x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x256x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<512x256x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<512x256x3x3xf16> } return %alloc : memref<512x256x3x3xf16> } func.func private @Unknown19(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - scf.for %arg1 = %c0 to %c2359296 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index 
- %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf16> - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown20(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - scf.for %arg1 = %c0 to %c2359296 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf16> - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown21(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<512x512x3x3xf16> scf.for %arg1 = %c0 to %c2359296 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = 
arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<512x512x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<512x512x3x3xf16> } return %alloc : memref<512x512x3x3xf16> } func.func private @Unknown22(%arg0: memref<4x1000xf32>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant -2.500000e-01 : f32 + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant -2.500000e-01 : f32 %c4000 = arith.constant 4000 : index - %c1 = arith.constant 1 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<4x1000xf16> scf.for %arg1 = %c0 to %c4000 step %c1 { %0 = arith.remsi %arg1, %c1000 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c1000 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c1000 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3] : memref<4x1000xf32> - %11 = arith.mulf %10, %cst : f32 - %12 = arith.truncf %11 : f32 to f16 - memref.store %12, %alloc[%9, %3] : memref<4x1000xf16> + %1 = arith.divsi %arg1, %c1000 : index + %2 = memref.load %arg0[%1, %0] : memref<4x1000xf32> + %3 = arith.mulf %2, %cst : f32 + %4 = arith.truncf %3 : f32 to f16 + memref.store %4, %alloc[%1, %0] : memref<4x1000xf16> } return %alloc : memref<4x1000xf16> } func.func private @Unknown23(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c512000 = arith.constant 512000 : index - %c1 = arith.constant 1 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1000x512xf16> scf.for %arg1 = %c0 to %c512000 step %c1 { %0 = arith.remsi %arg1, %c512 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c512 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = 
arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c512 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3] : memref<1000x512xf32> - %11 = arith.truncf %10 : f32 to f16 - memref.store %11, %alloc[%9, %3] : memref<1000x512xf16> + %1 = arith.divsi %arg1, %c512 : index + %2 = memref.load %arg0[%1, %0] : memref<1000x512xf32> + %3 = arith.truncf %2 : f32 to f16 + memref.store %3, %alloc[%1, %0] : memref<1000x512xf16> } return %alloc : memref<1000x512xf16> } - func.func private @Unknown24(%arg0: memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + func.func private @Unknown24(%arg0: memref<1000xf32>) -> memref<1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<1000xf16> + scf.for %arg1 = %c0 to %c1000 step %c1 { + %0 = memref.load %arg0[%arg1] : memref<1000xf32> + %1 = arith.truncf %0 : f32 to f16 + memref.store %1, %alloc[%arg1] : memref<1000xf16> + } + return %alloc : memref<1000xf16> + } + func.func private @Unknown25(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index %c1 = arith.constant 1 : index - %c112 = arith.constant 112 : index + %cst = arith.constant 0.000000e+00 : f16 + %c2 = arith.constant 2 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index + %c-1024 = arith.constant -1024 : index + %c1000 = arith.constant 1000 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = arith.muli %arg2, %c2 : index + %1 = arith.cmpi slt, %arg2, %c0 : index + %2 = arith.subi %c-1, %arg2 : index + %3 = arith.select %1, %2, %arg2 : index + %4 = arith.divsi %3, %c512 : index + %5 = arith.subi %c-1, %4 : index + %6 = arith.select %1, %5, %4 : index + %7 = arith.muli %6, %c-1024 : index + %8 = arith.addi %0, %7 : index + %9 = arith.cmpi slt, %8, %c1000 : index + %10 = arith.select %9, %8, %c1000 : index + %11 = arith.addi %8, %c2 : index + %12 = arith.cmpi slt, %11, %c1000 : index + %13 = arith.select %12, %11, %c1000 : index + %14 = arith.subi %13, %10 : index + %subview_8 = memref.subview %expand_shape[0, %10] [1, %14] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %15 = arith.cmpi ugt, %14, %c0 : index + %16 = scf.if %15 -> (f16) { + %21 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %17 = arith.addf %16, %cst : f16 + %18 = arith.cmpi ugt, %14, %c1 : index + %19 = scf.if %18 -> (f16) { + %21 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %20 = arith.addf %17, 
%19 : f16 + memref.store %20, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca[%3] : memref<512xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_0[%3] : memref<256xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_1[%3] : memref<128xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_2[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_3[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_4[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_5[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = 
arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_6[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_7[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_7[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private @Unknown26(%arg0: memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c3211264 = arith.constant 3211264 : index %alloc = memref.alloc() : memref<4x64x112x112xf16> %alloc_0 = memref.alloc() : memref<4x64x112x112xi1> scf.for %arg1 = %c0 to %c3211264 step %c1 { %0 = arith.remsi %arg1, %c112 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c112 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c112 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c112 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c112 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c112 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x112x112xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x64x112x112xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x64x112x112xi1> + %1 = arith.divsi %arg1, %c112 : index + %2 = arith.remsi %1, %c112 : index + %3 = arith.divsi %1, %c112 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x64x112x112xf16> + %7 = arith.maximumf %6, %cst : f16 + %8 = arith.cmpf ogt, %7, %cst : f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<4x64x112x112xf16> + memref.store %8, %alloc_0[%5, %4, %2, %0] : memref<4x64x112x112xi1> } return %alloc, %alloc_0 : memref<4x64x112x112xf16>, memref<4x64x112x112xi1> } - func.func private @Unknown26(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = 
arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index + func.func private @Unknown28(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<4x64x56x56xf16> - %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - scf.for %arg1 = %c0 to %c802816 step %c1 { - %0 = arith.remsi %arg1, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x64x56x56xi1> - } - return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> - } - func.func private @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<4x64x56x56xf16> - %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - scf.for %arg2 = %c0 to %c802816 step %c1 { - %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 
= arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %32 = arith.addf %30, %31 : f16 - %33 = arith.maxnumf %32, %cst : f16 - %34 = arith.cmpf ogt, %33, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> - memref.store %34, %alloc_0[%29, %23, %13, %3] : memref<4x64x56x56xi1> - } - return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> - } - func.func private @Unknown30(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> scf.for %arg1 = %c0 to %c802816 step %c1 { %0 = arith.remsi %arg1, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x64x56x56xi1> + %1 = arith.divsi %arg1, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x64x56x56xf16> + %7 = arith.maximumf %6, %cst : f16 + %8 = arith.cmpf ogt, %7, %cst : f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<4x64x56x56xf16> + memref.store %8, %alloc_0[%5, %4, %2, %0] : memref<4x64x56x56xi1> } return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> } - func.func private @Unknown32(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes 
{__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index + func.func private @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c802816 = arith.constant 802816 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> scf.for %arg2 = %c0 to %c802816 step %c1 { %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %32 = arith.addf %30, %31 : f16 - %33 = arith.maxnumf %32, %cst : f16 - %34 = arith.cmpf ogt, %33, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> - memref.store %34, %alloc_0[%29, %23, %13, %3] : memref<4x64x56x56xi1> + %1 = arith.divsi %arg2, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x64x56x56xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x64x56x56xf16> + %8 = arith.addf %6, %7 : f16 + %9 = arith.maximumf %8, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + memref.store %9, %alloc[%5, %4, %2, %0] : memref<4x64x56x56xf16> + memref.store %10, %alloc_0[%5, %4, %2, %0] : memref<4x64x56x56xi1> } return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> } - func.func private @Unknown35(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c1 = arith.constant 1 : index + func.func private @Unknown37(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + 
%c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c401408 = arith.constant 401408 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> scf.for %arg1 = %c0 to %c401408 step %c1 { %0 = arith.remsi %arg1, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x128x28x28xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x128x28x28xi1> + %1 = arith.divsi %arg1, %c28 : index + %2 = arith.remsi %1, %c28 : index + %3 = arith.divsi %1, %c28 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x128x28x28xf16> + %7 = arith.maximumf %6, %cst : f16 + %8 = arith.cmpf ogt, %7, %cst : f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<4x128x28x28xf16> + memref.store %8, %alloc_0[%5, %4, %2, %0] : memref<4x128x28x28xi1> } return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> } - func.func private @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c1 = arith.constant 1 : index + func.func private @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c401408 = arith.constant 401408 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> scf.for %arg2 = %c0 to %c401408 step %c1 { %0 = arith.remsi %arg2, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c28 : index - 
%8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %32 = arith.addf %30, %31 : f16 - %33 = arith.maxnumf %32, %cst : f16 - %34 = arith.cmpf ogt, %33, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x128x28x28xf16> - memref.store %34, %alloc_0[%29, %23, %13, %3] : memref<4x128x28x28xi1> + %1 = arith.divsi %arg2, %c28 : index + %2 = arith.remsi %1, %c28 : index + %3 = arith.divsi %1, %c28 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x128x28x28xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x128x28x28xf16> + %8 = arith.addf %6, %7 : f16 + %9 = arith.maximumf %8, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + memref.store %9, %alloc[%5, %4, %2, %0] : memref<4x128x28x28xf16> + memref.store %10, %alloc_0[%5, %4, %2, %0] : memref<4x128x28x28xi1> } return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> } - func.func private @Unknown39(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index + func.func private @Unknown46(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<4x128x28x28xf16> - %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - scf.for %arg1 = %c0 to %c401408 step %c1 { - %0 = arith.remsi %arg1, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, 
%c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x128x28x28xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x128x28x28xi1> - } - return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> - } - func.func private @Unknown41(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<4x128x28x28xf16> - %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - scf.for %arg2 = %c0 to %c401408 step %c1 { - %0 = arith.remsi %arg2, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %32 = arith.addf %30, %31 : f16 - %33 = arith.maxnumf %32, %cst : f16 - %34 = arith.cmpf ogt, %33, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x128x28x28xf16> - memref.store %34, %alloc_0[%29, %23, %13, %3] : memref<4x128x28x28xi1> - } - return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> - } - func.func private @Unknown44(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> scf.for %arg1 = %c0 to %c200704 step %c1 { %0 = arith.remsi 
%arg1, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x256x14x14xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x256x14x14xi1> + %1 = arith.divsi %arg1, %c14 : index + %2 = arith.remsi %1, %c14 : index + %3 = arith.divsi %1, %c14 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x256x14x14xf16> + %7 = arith.maximumf %6, %cst : f16 + %8 = arith.cmpf ogt, %7, %cst : f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<4x256x14x14xf16> + memref.store %8, %alloc_0[%5, %4, %2, %0] : memref<4x256x14x14xi1> } return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> } - func.func private @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index + func.func private @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<4x256x14x14xf16> - %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - scf.for %arg2 = %c0 to %c200704 step %c1 { - %0 = arith.remsi %arg2, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, 
%17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %32 = arith.addf %30, %31 : f16 - %33 = arith.maxnumf %32, %cst : f16 - %34 = arith.cmpf ogt, %33, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x256x14x14xf16> - memref.store %34, %alloc_0[%29, %23, %13, %3] : memref<4x256x14x14xi1> - } - return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> - } - func.func private @Unknown48(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<4x256x14x14xf16> - %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - scf.for %arg1 = %c0 to %c200704 step %c1 { - %0 = arith.remsi %arg1, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x256x14x14xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x256x14x14xi1> - } - return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> - } - func.func private @Unknown50(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %alloc = memref.alloc() : 
memref<4x256x14x14xf16> %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> scf.for %arg2 = %c0 to %c200704 step %c1 { %0 = arith.remsi %arg2, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %32 = arith.addf %30, %31 : f16 - %33 = arith.maxnumf %32, %cst : f16 - %34 = arith.cmpf ogt, %33, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x256x14x14xf16> - memref.store %34, %alloc_0[%29, %23, %13, %3] : memref<4x256x14x14xi1> + %1 = arith.divsi %arg2, %c14 : index + %2 = arith.remsi %1, %c14 : index + %3 = arith.divsi %1, %c14 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x256x14x14xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x256x14x14xf16> + %8 = arith.addf %6, %7 : f16 + %9 = arith.maximumf %8, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + memref.store %9, %alloc[%5, %4, %2, %0] : memref<4x256x14x14xf16> + memref.store %10, %alloc_0[%5, %4, %2, %0] : memref<4x256x14x14xi1> } return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> } - func.func private @Unknown53(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index + func.func private @Unknown55(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<4x512x7x7xf16> - %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - scf.for %arg1 = %c0 to %c100352 step %c1 { - %0 = arith.remsi %arg1, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = 
arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x512x7x7xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x512x7x7xi1> - } - return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> - } - func.func private @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<4x512x7x7xf16> - %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - scf.for %arg2 = %c0 to %c100352 step %c1 { - %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %32 = arith.addf %30, %31 : f16 - %33 = arith.maxnumf %32, %cst : f16 - %34 = arith.cmpf ogt, %33, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x512x7x7xf16> - memref.store %34, %alloc_0[%29, %23, %13, %3] : memref<4x512x7x7xi1> - } - return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> - } - func.func private @Unknown57(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c100352 = 
arith.constant 100352 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> scf.for %arg1 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg1, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x512x7x7xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x512x7x7xi1> + %1 = arith.divsi %arg1, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x512x7x7xf16> + %7 = arith.maximumf %6, %cst : f16 + %8 = arith.cmpf ogt, %7, %cst : f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<4x512x7x7xf16> + memref.store %8, %alloc_0[%5, %4, %2, %0] : memref<4x512x7x7xi1> } return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> } - func.func private @Unknown59(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index + func.func private @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c100352 = arith.constant 100352 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> scf.for %arg2 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - 
%9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %32 = arith.addf %30, %31 : f16 - %33 = arith.maxnumf %32, %cst : f16 - %34 = arith.cmpf ogt, %33, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x512x7x7xf16> - memref.store %34, %alloc_0[%29, %23, %13, %3] : memref<4x512x7x7xi1> + %1 = arith.divsi %arg2, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x512x7x7xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x512x7x7xf16> + %8 = arith.addf %6, %7 : f16 + %9 = arith.maximumf %8, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + memref.store %9, %alloc[%5, %4, %2, %0] : memref<4x512x7x7xf16> + memref.store %10, %alloc_0[%5, %4, %2, %0] : memref<4x512x7x7xi1> } return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> } - func.func private @Unknown60(%arg0: memref<4x512xf16>) -> memref<4x512xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 2.040100e-02 : f16 + func.func private @Unknown62(%arg0: memref<4x512x7x7xf16>) -> memref<4x512xf16> attributes {__byteir_reduction_fusion__} { %c0 = arith.constant 0 : index - %c2048 = arith.constant 2048 : index - %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c64 = arith.constant 64 : index + %c49 = arith.constant 49 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %collapse_shape = memref.collapse_shape %arg0 [[0, 1], [2, 3]] : memref<4x512x7x7xf16> into memref<2048x49xf16> + %alloc = memref.alloc() : memref<2048xf16> + scf.forall (%arg1) in (2048) { + %subview = memref.subview %collapse_shape[%arg1, 0] [1, 49] [1, 1] : memref<2048x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape_0 = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = arith.remsi %arg2, %c64 : index + %1 = arith.cmpi slt, %0, %c0 : index + %2 = arith.addi %0, %c64 : index + %3 = arith.select %1, %2, %0 : index + %4 = arith.cmpi slt, %3, %c49 : index + %5 = arith.select %4, %3, %c49 : index + %6 = arith.addi %3, %c1 : index + %7 = arith.cmpi slt, %6, %c49 : index + %8 = arith.select %7, %6, %c49 : index + %9 = arith.subi %8, %5 : index + %subview_6 = memref.subview %expand_shape_0[0, %5] [1, %9] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_7 = memref.expand_shape %subview_6 
[[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %10 = arith.cmpi ugt, %9, %c0 : index + %11 = scf.if %10 -> (f16) { + %13 = memref.load %expand_shape_7[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %13 : f16 + } else { + scf.yield %cst : f16 + } + %12 = arith.addf %11, %cst : f16 + memref.store %12, %alloca[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_1[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_1[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_2[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_2[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_3[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_3[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_4[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_4[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_5[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_5[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<2048xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + %expand_shape = memref.expand_shape %alloc [[0, 1]] : memref<2048xf16> into memref<4x512xf16> + return %expand_shape : memref<4x512xf16> + } + func.func private @Unknown63(%arg0: memref<4x512xf16>) -> memref<4x512xf16> attributes {__byteir_elementwise_fusion__} { %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index + %c1 = 
arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 2.040100e-02 : f16 + %c2048 = arith.constant 2048 : index %alloc = memref.alloc() : memref<4x512xf16> scf.for %arg1 = %c0 to %c2048 step %c1 { %0 = arith.remsi %arg1, %c512 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c512 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c512 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3] : memref<4x512xf16> - %11 = arith.mulf %10, %cst : f16 - memref.store %11, %alloc[%9, %3] : memref<4x512xf16> + %1 = arith.divsi %arg1, %c512 : index + %2 = memref.load %arg0[%1, %0] : memref<4x512xf16> + %3 = arith.mulf %2, %cst : f16 + memref.store %3, %alloc[%1, %0] : memref<4x512xf16> } return %alloc : memref<4x512xf16> } - func.func private @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown64(%arg0: memref<1000xf16>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c4000 = arith.constant 4000 : index - %c1 = arith.constant 1 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<4x1000xf16> scf.for %arg2 = %c0 to %c4000 step %c1 { %0 = arith.remsi %arg2, %c1000 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c1000 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c1000 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg1[%9, %3] : memref<4x1000xf16> - %11 = memref.load %arg0[%3] : memref<1000xf32> - %12 = arith.truncf %11 : f32 to f16 - %13 = arith.addf %10, %12 : f16 - memref.store %13, %alloc[%9, %3] : memref<4x1000xf16> + %1 = arith.divsi %arg2, %c1000 : index + %2 = memref.load %arg0[%0] : memref<1000xf16> + %3 = memref.load %arg1[%1, %0] : memref<4x1000xf16> + %4 = arith.addf %3, %2 : f16 + memref.store %4, %alloc[%1, %0] : memref<4x1000xf16> } return %alloc : memref<4x1000xf16> } - func.func private @Unknown62(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown65(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index %c1 = arith.constant 1 : index - %c1000 = arith.constant 1000 : index + %cst = arith.constant 0.000000e+00 : f16 + %c2 = arith.constant 2 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index + %c-1024 = arith.constant -1024 : index + %c1000 = arith.constant 1000 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, 
#gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = arith.muli %arg2, %c2 : index + %1 = arith.cmpi slt, %arg2, %c0 : index + %2 = arith.subi %c-1, %arg2 : index + %3 = arith.select %1, %2, %arg2 : index + %4 = arith.divsi %3, %c512 : index + %5 = arith.subi %c-1, %4 : index + %6 = arith.select %1, %5, %4 : index + %7 = arith.muli %6, %c-1024 : index + %8 = arith.addi %0, %7 : index + %9 = arith.cmpi slt, %8, %c1000 : index + %10 = arith.select %9, %8, %c1000 : index + %11 = arith.addi %8, %c2 : index + %12 = arith.cmpi slt, %11, %c1000 : index + %13 = arith.select %12, %11, %c1000 : index + %14 = arith.subi %13, %10 : index + %subview_8 = memref.subview %expand_shape[0, %10] [1, %14] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %15 = arith.cmpi ugt, %14, %c0 : index + %16 = scf.if %15 -> (f16) { + %20 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %20 : f16 + } else { + scf.yield %cst : f16 + } + %17 = arith.cmpi ugt, %14, %c1 : index + %18 = scf.if %17 -> (f16) { + %20 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %20 : f16 + } else { + scf.yield %cst : f16 + } + %19 = arith.maximumf %16, %18 : f16 + memref.store %19, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca_0[%2] : memref<256xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca_1[%2] : memref<128xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca_2[%2] : memref<64xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca_3[%2] : 
memref<32xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca_4[%2] : memref<16xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca_5[%2] : memref<8xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca_6[%2] : memref<4xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_7[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca_7[%2] : memref<2xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private @Unknown66(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c4000 = arith.constant 4000 : index %alloc = memref.alloc() : memref<4x1000xf16> - %alloc_0 = memref.alloc() : memref<4x1000xf16> scf.for %arg2 = %c0 to %c4000 step %c1 { %0 = arith.remsi %arg2, %c1000 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c1000 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c1000 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg1[%9, %3] : memref<4x1000xf16> - %11 = memref.load %arg0[%9] : memref<4xf16> - %12 = arith.subf %10, %11 : f16 - %13 = math.exp %12 : f16 - memref.store %12, %alloc[%9, %3] : memref<4x1000xf16> - memref.store %13, %alloc_0[%9, %3] : memref<4x1000xf16> + %1 = arith.divsi %arg2, %c1000 : index + %2 = memref.load %arg0[%1] : memref<4xf16> + %3 = memref.load %arg1[%1, %0] : memref<4x1000xf16> + %4 = arith.subf %3, %2 : f16 + memref.store %4, %alloc[%1, %0] : memref<4x1000xf16> } - return %alloc, %alloc_0 : memref<4x1000xf16>, memref<4x1000xf16> + return %alloc : memref<4x1000xf16> } - func.func private @Unknown63(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf32>) -> (memref<4x1000xf16>, memref<4x1000xf32>, 
memref<4x1000xf32>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown67(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index %c1 = arith.constant 1 : index - %c1000 = arith.constant 1000 : index + %cst = arith.constant 0.000000e+00 : f16 + %c2 = arith.constant 2 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index + %c-1024 = arith.constant -1024 : index + %c1000 = arith.constant 1000 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = arith.muli %arg2, %c2 : index + %1 = arith.cmpi slt, %arg2, %c0 : index + %2 = arith.subi %c-1, %arg2 : index + %3 = arith.select %1, %2, %arg2 : index + %4 = arith.divsi %3, %c512 : index + %5 = arith.subi %c-1, %4 : index + %6 = arith.select %1, %5, %4 : index + %7 = arith.muli %6, %c-1024 : index + %8 = arith.addi %0, %7 : index + %9 = arith.cmpi slt, %8, %c1000 : index + %10 = arith.select %9, %8, %c1000 : index + %11 = arith.addi %8, %c2 : index + %12 = arith.cmpi slt, %11, %c1000 : index + %13 = arith.select %12, %11, %c1000 : index + %14 = arith.subi %13, %10 : index + %subview_8 = memref.subview %expand_shape[0, %10] [1, %14] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %15 = arith.cmpi ugt, %14, %c0 : index + %16 = scf.if %15 -> (f16) { + %23 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %23 : f16 + } else { + scf.yield %cst : f16 + } + %17 = math.exp %16 : f16 + %18 = arith.addf %17, %cst : f16 + %19 = arith.cmpi ugt, %14, %c1 : index + %20 = scf.if %19 -> (f16) { + %23 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %23 : f16 + } else { + scf.yield %cst : f16 + } + %21 = math.exp %20 : f16 + %22 = arith.addf %18, %21 : f16 + memref.store %22, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca[%3] : memref<512xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_0[%3] : memref<256xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall 
(%arg2) in (64) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_1[%3] : memref<128xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_2[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_3[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_4[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_5[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_6[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_7[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_7[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private @Unknown68(%arg0: memref<4xf16>) -> memref<4xf16> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + scf.for %arg1 = %c0 to %c4 step %c1 { + %0 = memref.load %arg0[%arg1] : 
memref<4xf16> + %1 = math.log %0 : f16 + memref.store %1, %alloc[%arg1] : memref<4xf16> + } + return %alloc : memref<4xf16> + } + func.func private @Unknown69(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c4000 = arith.constant 4000 : index %alloc = memref.alloc() : memref<4x1000xf16> - %alloc_0 = memref.alloc() : memref<4x1000xf32> - %alloc_1 = memref.alloc() : memref<4x1000xf32> - scf.for %arg5 = %c0 to %c4000 step %c1 { - %0 = arith.remsi %arg5, %c1000 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c1000 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg5, %c0 : index - %5 = arith.subi %c-1, %arg5 : index - %6 = arith.select %4, %5, %arg5 : index - %7 = arith.divsi %6, %c1000 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg3[%9, %3] : memref<4x1000xf16> - %11 = memref.load %arg1[%9, %3] : memref<4x1000xf16> - %12 = memref.load %arg0[%9] : memref<4xf16> - %13 = memref.load %arg2[%9] : memref<4xf16> - %14 = memref.load %arg4[%9, %3] : memref<4x1000xf32> - %15 = math.log %12 : f16 - %16 = arith.subf %11, %15 : f16 - %17 = math.exp %16 : f16 - %18 = arith.mulf %17, %13 : f16 - %19 = arith.subf %10, %18 : f16 - %20 = arith.extf %16 : f16 to f32 - %21 = arith.mulf %20, %14 : f32 - %22 = arith.extf %19 : f16 to f32 - memref.store %19, %alloc[%9, %3] : memref<4x1000xf16> - memref.store %21, %alloc_0[%9, %3] : memref<4x1000xf32> - memref.store %22, %alloc_1[%9, %3] : memref<4x1000xf32> + %alloc_0 = memref.alloc() : memref<4x1000xf16> + scf.for %arg4 = %c0 to %c4000 step %c1 { + %0 = arith.remsi %arg4, %c1000 : index + %1 = arith.divsi %arg4, %c1000 : index + %2 = memref.load %arg2[%1] : memref<4xf16> + %3 = memref.load %arg0[%1] : memref<4xf16> + %4 = memref.load %arg1[%1, %0] : memref<4x1000xf16> + %5 = memref.load %arg3[%1, %0] : memref<4x1000xf16> + %6 = arith.subf %4, %3 : f16 + %7 = math.exp %6 : f16 + %8 = arith.mulf %7, %2 : f16 + %9 = arith.subf %5, %8 : f16 + memref.store %6, %alloc[%1, %0] : memref<4x1000xf16> + memref.store %9, %alloc_0[%1, %0] : memref<4x1000xf16> } - return %alloc, %alloc_0, %alloc_1 : memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32> + return %alloc, %alloc_0 : memref<4x1000xf16>, memref<4x1000xf16> } - func.func private @Unknown64(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown70(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> scf.for %arg2 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - 
%5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg1[%29, %23, %13, %3] : memref<4x512x7x7xi1> - %31 = memref.load %arg0[%29, %23] : memref<4x512xf16> - %32 = arith.divf %31, %cst_0 : f16 - %33 = arith.select %30, %32, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x512x7x7xf16> + %1 = arith.divsi %arg2, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4] : memref<4x512xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x512x7x7xi1> + %8 = arith.divf %6, %cst_0 : f16 + %9 = arith.select %7, %8, %cst : f16 + memref.store %9, %alloc[%5, %4, %2, %0] : memref<4x512x7x7xf16> } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown68(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index + func.func private @Unknown74(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c100352 = arith.constant 100352 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> scf.for %arg2 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : 
index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x512x7x7xi1> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %32 = arith.select %30, %31, %cst : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x512x7x7xf16> + %1 = arith.divsi %arg2, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x512x7x7xi1> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x512x7x7xf16> + %8 = arith.select %6, %7, %cst : f16 + memref.store %8, %alloc[%5, %4, %2, %0] : memref<4x512x7x7xf16> } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown72(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index + func.func private @Unknown78(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c100352 = arith.constant 100352 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> scf.for %arg3 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg3, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg2[%29, %23, %13, %3] : memref<4x512x7x7xi1> - %31 = memref.load %arg0[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %32 = memref.load %arg1[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %33 = arith.addf %31, %32 : f16 - %34 = arith.select %30, %33, %cst : f16 - memref.store %34, %alloc[%29, %23, %13, %3] : memref<4x512x7x7xf16> + %1 = arith.divsi %arg3, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x512x7x7xf16> + %7 = memref.load %arg1[%5, %4, 
%2, %0] : memref<4x512x7x7xf16> + %8 = memref.load %arg2[%5, %4, %2, %0] : memref<4x512x7x7xi1> + %9 = arith.addf %6, %7 : f16 + %10 = arith.select %8, %9, %cst : f16 + memref.store %10, %alloc[%5, %4, %2, %0] : memref<4x512x7x7xf16> } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown76(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index + func.func private @Unknown89(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<4x512x7x7xf16> - scf.for %arg2 = %c0 to %c100352 step %c1 { - %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x512x7x7xi1> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %32 = arith.select %30, %31, %cst : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x512x7x7xf16> - } - return %alloc : memref<4x512x7x7xf16> - } - func.func private @Unknown83(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> scf.for %arg3 = %c0 to %c200704 step %c1 { %0 = arith.remsi %arg3, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi 
slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg2[%29, %23, %13, %3] : memref<4x256x14x14xi1> - %31 = memref.load %arg0[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %32 = memref.load %arg1[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %33 = arith.addf %31, %32 : f16 - %34 = arith.select %30, %33, %cst : f16 - memref.store %34, %alloc[%29, %23, %13, %3] : memref<4x256x14x14xf16> + %1 = arith.divsi %arg3, %c14 : index + %2 = arith.remsi %1, %c14 : index + %3 = arith.divsi %1, %c14 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x256x14x14xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x256x14x14xf16> + %8 = memref.load %arg2[%5, %4, %2, %0] : memref<4x256x14x14xi1> + %9 = arith.addf %6, %7 : f16 + %10 = arith.select %8, %9, %cst : f16 + memref.store %10, %alloc[%5, %4, %2, %0] : memref<4x256x14x14xf16> } return %alloc : memref<4x256x14x14xf16> } - func.func private @Unknown87(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index + func.func private @Unknown93(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<4x256x14x14xf16> - scf.for %arg2 = %c0 to %c200704 step %c1 { - %0 = arith.remsi %arg2, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load 
%arg0[%29, %23, %13, %3] : memref<4x256x14x14xi1> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %32 = arith.select %30, %31, %cst : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x256x14x14xf16> - } - return %alloc : memref<4x256x14x14xf16> - } - func.func private @Unknown91(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<4x256x14x14xf16> - scf.for %arg3 = %c0 to %c200704 step %c1 { - %0 = arith.remsi %arg3, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg2[%29, %23, %13, %3] : memref<4x256x14x14xi1> - %31 = memref.load %arg0[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %32 = memref.load %arg1[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %33 = arith.addf %31, %32 : f16 - %34 = arith.select %30, %33, %cst : f16 - memref.store %34, %alloc[%29, %23, %13, %3] : memref<4x256x14x14xf16> - } - return %alloc : memref<4x256x14x14xf16> - } - func.func private @Unknown95(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> scf.for %arg2 = %c0 to %c200704 step %c1 { %0 = arith.remsi %arg2, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 
: index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x256x14x14xi1> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %32 = arith.select %30, %31, %cst : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x256x14x14xf16> + %1 = arith.divsi %arg2, %c14 : index + %2 = arith.remsi %1, %c14 : index + %3 = arith.divsi %1, %c14 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x256x14x14xi1> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x256x14x14xf16> + %8 = arith.select %6, %7, %cst : f16 + memref.store %8, %alloc[%5, %4, %2, %0] : memref<4x256x14x14xf16> } return %alloc : memref<4x256x14x14xf16> } - func.func private @Unknown102(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c1 = arith.constant 1 : index + func.func private @Unknown108(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<4x128x28x28xf16> - scf.for %arg3 = %c0 to %c401408 step %c1 { - %0 = arith.remsi %arg3, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg2[%29, %23, %13, %3] : memref<4x128x28x28xi1> - %31 = memref.load %arg0[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %32 = memref.load %arg1[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %33 = arith.addf %31, %32 : f16 - %34 = arith.select %30, %33, %cst : f16 - 
memref.store %34, %alloc[%29, %23, %13, %3] : memref<4x128x28x28xf16> - } - return %alloc : memref<4x128x28x28xf16> - } - func.func private @Unknown106(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<4x128x28x28xf16> - scf.for %arg2 = %c0 to %c401408 step %c1 { - %0 = arith.remsi %arg2, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x128x28x28xi1> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %32 = arith.select %30, %31, %cst : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x128x28x28xf16> - } - return %alloc : memref<4x128x28x28xf16> - } - func.func private @Unknown110(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c401408 = arith.constant 401408 : index - %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> scf.for %arg3 = %c0 to %c401408 step %c1 { %0 = arith.remsi %arg3, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, 
%20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg2[%29, %23, %13, %3] : memref<4x128x28x28xi1> - %31 = memref.load %arg0[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %32 = memref.load %arg1[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %33 = arith.addf %31, %32 : f16 - %34 = arith.select %30, %33, %cst : f16 - memref.store %34, %alloc[%29, %23, %13, %3] : memref<4x128x28x28xf16> + %1 = arith.divsi %arg3, %c28 : index + %2 = arith.remsi %1, %c28 : index + %3 = arith.divsi %1, %c28 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x128x28x28xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x128x28x28xf16> + %8 = memref.load %arg2[%5, %4, %2, %0] : memref<4x128x28x28xi1> + %9 = arith.addf %6, %7 : f16 + %10 = arith.select %8, %9, %cst : f16 + memref.store %10, %alloc[%5, %4, %2, %0] : memref<4x128x28x28xf16> } return %alloc : memref<4x128x28x28xf16> } - func.func private @Unknown114(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c1 = arith.constant 1 : index + func.func private @Unknown112(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c401408 = arith.constant 401408 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> scf.for %arg2 = %c0 to %c401408 step %c1 { %0 = arith.remsi %arg2, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x128x28x28xi1> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %32 = arith.select %30, %31, %cst : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x128x28x28xf16> + %1 = arith.divsi 
%arg2, %c28 : index + %2 = arith.remsi %1, %c28 : index + %3 = arith.divsi %1, %c28 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x128x28x28xi1> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x128x28x28xf16> + %8 = arith.select %6, %7, %cst : f16 + memref.store %8, %alloc[%5, %4, %2, %0] : memref<4x128x28x28xf16> } return %alloc : memref<4x128x28x28xf16> } - func.func private @Unknown121(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index + func.func private @Unknown127(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<4x64x56x56xf16> - scf.for %arg3 = %c0 to %c802816 step %c1 { - %0 = arith.remsi %arg3, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg2[%29, %23, %13, %3] : memref<4x64x56x56xi1> - %31 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %32 = memref.load %arg1[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %33 = arith.addf %31, %32 : f16 - %34 = arith.select %30, %33, %cst : f16 - memref.store %34, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> - } - return %alloc : memref<4x64x56x56xf16> - } - func.func private @Unknown125(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<4x64x56x56xf16> - scf.for %arg2 = %c0 to %c802816 step %c1 { - %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : 
index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xi1> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %32 = arith.select %30, %31, %cst : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> - } - return %alloc : memref<4x64x56x56xf16> - } - func.func private @Unknown129(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> scf.for %arg3 = %c0 to %c802816 step %c1 { %0 = arith.remsi %arg3, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg2[%29, %23, %13, %3] : memref<4x64x56x56xi1> - %31 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %32 = memref.load %arg1[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %33 = arith.addf %31, %32 : f16 - %34 = arith.select %30, %33, %cst : f16 - memref.store %34, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> + %1 = arith.divsi %arg3, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : 
memref<4x64x56x56xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x64x56x56xf16> + %8 = memref.load %arg2[%5, %4, %2, %0] : memref<4x64x56x56xi1> + %9 = arith.addf %6, %7 : f16 + %10 = arith.select %8, %9, %cst : f16 + memref.store %10, %alloc[%5, %4, %2, %0] : memref<4x64x56x56xf16> } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown133(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index + func.func private @Unknown131(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c802816 = arith.constant 802816 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> scf.for %arg2 = %c0 to %c802816 step %c1 { %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xi1> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %32 = arith.select %30, %31, %cst : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> + %1 = arith.divsi %arg2, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x64x56x56xi1> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x64x56x56xf16> + %8 = arith.select %6, %7, %cst : f16 + memref.store %8, %alloc[%5, %4, %2, %0] : memref<4x64x56x56xf16> } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown137(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index + func.func private @Unknown143(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : 
index %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c802816 = arith.constant 802816 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> scf.for %arg2 = %c0 to %c802816 step %c1 { %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %32 = arith.addf %30, %31 : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> + %1 = arith.divsi %arg2, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x64x56x56xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x64x56x56xf16> + %8 = arith.addf %6, %7 : f16 + memref.store %8, %alloc[%5, %4, %2, %0] : memref<4x64x56x56xf16> } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown138(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c1 = arith.constant 1 : index + func.func private @Unknown144(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c3211264 = arith.constant 3211264 : index %alloc = memref.alloc() : memref<4x64x112x112xf16> scf.for %arg2 = %c0 to %c3211264 step %c1 { %0 = arith.remsi %arg2, %c112 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c112 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c112 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c112 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c112 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = 
-      %16 = arith.select %14, %15, %9 : index
-      %17 = arith.divsi %16, %c112 : index
-      %18 = arith.subi %c-1, %17 : index
-      %19 = arith.select %14, %18, %17 : index
-      %20 = arith.remsi %19, %c64 : index
-      %21 = arith.cmpi slt, %20, %c0 : index
-      %22 = arith.addi %20, %c64 : index
-      %23 = arith.select %21, %22, %20 : index
-      %24 = arith.cmpi slt, %19, %c0 : index
-      %25 = arith.subi %c-1, %19 : index
-      %26 = arith.select %24, %25, %19 : index
-      %27 = arith.divsi %26, %c64 : index
-      %28 = arith.subi %c-1, %27 : index
-      %29 = arith.select %24, %28, %27 : index
-      %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x112x112xi1>
-      %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x64x112x112xf16>
-      %32 = arith.select %30, %31, %cst : f16
-      memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x64x112x112xf16>
+      %1 = arith.divsi %arg2, %c112 : index
+      %2 = arith.remsi %1, %c112 : index
+      %3 = arith.divsi %1, %c112 : index
+      %4 = arith.remsi %3, %c64 : index
+      %5 = arith.divsi %3, %c64 : index
+      %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x64x112x112xi1>
+      %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x64x112x112xf16>
+      %8 = arith.select %6, %7, %cst : f16
+      memref.store %8, %alloc[%5, %4, %2, %0] : memref<4x64x112x112xf16>
     }
     return %alloc : memref<4x64x112x112xf16>
   }
-  func.func private @Unknown141(%arg0: memref<f32>) -> memref<f32> attributes {__byteir_elementwise_fusion__} {
+  func.func private @Unknown147(%arg0: memref<4x1000xf16>, %arg1: memref<4x1000xf32>) -> memref<f32> attributes {__byteir_reduction_fusion__} {
+    %c0 = arith.constant 0 : index
+    %cst = arith.constant 0.000000e+00 : f16
+    %cst_0 = arith.constant 0.000000e+00 : f32
+    %c128 = arith.constant 128 : index
+    %c125 = arith.constant 125 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %c32 = arith.constant 32 : index
+    %alloc = memref.alloc() : memref<f32>
+    %collapse_shape = memref.collapse_shape %arg0 [[0, 1]] : memref<4x1000xf16> into memref<4000xf16>
+    %collapse_shape_1 = memref.collapse_shape %arg1 [[0, 1]] : memref<4x1000xf32> into memref<4000xf32>
+    %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<4000xf16> into memref<32x125xf16>
+    %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] : memref<4000xf32> into memref<32x125xf32>
+    %alloc_3 = memref.alloc() : memref<32xf32>
+    scf.forall (%arg2) in (32) {
+      %subview = memref.subview %expand_shape[%arg2, 0] [1, 125] [1, 1] : memref<32x125xf16> to memref<125xf16, strided<[1], offset: ?>>
+      %expand_shape_4 = memref.expand_shape %subview [[0, 1]] : memref<125xf16, strided<[1], offset: ?>> into memref<1x125xf16, strided<[125, 1], offset: ?>>
+      %subview_5 = memref.subview %expand_shape_2[%arg2, 0] [1, 125] [1, 1] : memref<32x125xf32> to memref<125xf32, strided<[1], offset: ?>>
+      %expand_shape_6 = memref.expand_shape %subview_5 [[0, 1]] : memref<125xf32, strided<[1], offset: ?>> into memref<1x125xf32, strided<[125, 1], offset: ?>>
+      %alloca = memref.alloca() : memref<128xf32, #gpu.address_space<workgroup>>
+      scf.forall (%arg3) in (128) {
+        %0 = arith.remsi %arg3, %c128 : index
+        %1 = arith.cmpi slt, %0, %c0 : index
+        %2 = arith.addi %0, %c128 : index
+        %3 = arith.select %1, %2, %0 : index
+        %4 = arith.cmpi slt, %3, %c125 : index
+        %5 = arith.select %4, %3, %c125 : index
+        %6 = arith.addi %3, %c1 : index
+        %7 = arith.cmpi slt, %6, %c125 : index
+        %8 = arith.select %7, %6, %c125 : index
+        %9 = arith.subi %8, %5 : index
+        %subview_13 = memref.subview %expand_shape_4[0, %5] [1, %9] [1, 1] : memref<1x125xf16, strided<[125,
1], offset: ?>> to memref> + %expand_shape_14 = memref.expand_shape %subview_13 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %subview_15 = memref.subview %expand_shape_6[0, %5] [1, %9] [1, 1] : memref<1x125xf32, strided<[125, 1], offset: ?>> to memref> + %expand_shape_16 = memref.expand_shape %subview_15 [[0, 1]] : memref> into memref<1x?xf32, strided<[?, 1], offset: ?>> + %10 = arith.cmpi ugt, %9, %c0 : index + %11:2 = scf.if %10 -> (f16, f32) { + %15 = memref.load %expand_shape_14[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + %16 = memref.load %expand_shape_16[%c0, %c0] : memref<1x?xf32, strided<[?, 1], offset: ?>> + scf.yield %15, %16 : f16, f32 + } else { + scf.yield %cst, %cst_0 : f16, f32 + } + %12 = arith.extf %11#0 : f16 to f32 + %13 = arith.mulf %12, %11#1 : f32 + %14 = arith.addf %13, %cst_0 : f32 + memref.store %14, %alloca[%arg3] : memref<128xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<64xf32, #gpu.address_space> + scf.forall (%arg3) in (64) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca[%0] : memref<128xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca[%3] : memref<128xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_7[%arg3] : memref<64xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_8 = memref.alloca() : memref<32xf32, #gpu.address_space> + scf.forall (%arg3) in (32) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_7[%0] : memref<64xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_7[%3] : memref<64xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_8[%arg3] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_9 = memref.alloca() : memref<16xf32, #gpu.address_space> + scf.forall (%arg3) in (16) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_8[%0] : memref<32xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_8[%3] : memref<32xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_9[%arg3] : memref<16xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_10 = memref.alloca() : memref<8xf32, #gpu.address_space> + scf.forall (%arg3) in (8) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_9[%0] : memref<16xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_9[%3] : memref<16xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_10[%arg3] : memref<8xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_11 = memref.alloca() : memref<4xf32, #gpu.address_space> + scf.forall (%arg3) in (4) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_10[%0] : memref<8xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_10[%3] : memref<8xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_11[%arg3] : memref<4xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_12 = memref.alloca() : memref<2xf32, #gpu.address_space> + scf.forall (%arg3) in (2) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_11[%0] : memref<4xf32, #gpu.address_space> + %2 
= arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_11[%3] : memref<4xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_12[%arg3] : memref<2xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg3) in (1) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_12[%0] : memref<2xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_12[%3] : memref<2xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloc_3[%arg2] : memref<32xf32> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + scf.forall (%arg2) in (1) { + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + scf.forall (%arg3) in (32) { + %0 = arith.muli %arg2, %c32 : index + %1 = arith.addi %0, %arg3 : index + %2 = memref.load %alloc_3[%1] : memref<32xf32> + %3 = arith.addf %2, %cst_0 : f32 + memref.store %3, %alloca[%arg3] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf32, #gpu.address_space> + scf.forall (%arg3) in (16) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca[%0] : memref<32xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca[%3] : memref<32xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_4[%arg3] : memref<16xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf32, #gpu.address_space> + scf.forall (%arg3) in (8) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_4[%0] : memref<16xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_4[%3] : memref<16xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_5[%arg3] : memref<8xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf32, #gpu.address_space> + scf.forall (%arg3) in (4) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_5[%0] : memref<8xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_5[%3] : memref<8xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_6[%arg3] : memref<4xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf32, #gpu.address_space> + scf.forall (%arg3) in (2) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_6[%0] : memref<4xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_6[%3] : memref<4xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_7[%arg3] : memref<2xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg3) in (1) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_7[%0] : memref<2xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_7[%3] : memref<2xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloc[] : memref + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref + } + func.func private @Unknown148(%arg0: memref) -> memref attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 4.000000e+00 : f32 %alloc = 
memref.alloc() : memref %0 = memref.load %arg0[] : memref @@ -2772,871 +1536,292 @@ module @IrToMhlo.2452 { memref.store %2, %alloc[] : memref return %alloc : memref } - func.func private @Unknown142(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c1 = arith.constant 1 : index + func.func private @Unknown149(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c9408 = arith.constant 9408 : index %alloc = memref.alloc() : memref<64x3x7x7xf32> scf.for %arg1 = %c0 to %c9408 step %c1 { %0 = arith.remsi %arg1, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c3 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c3 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c3 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x3x7x7xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x3x7x7xf32> + %1 = arith.divsi %arg1, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c3 : index + %5 = arith.divsi %3, %c3 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<64x3x7x7xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<64x3x7x7xf32> } return %alloc : memref<64x3x7x7xf32> } - func.func private @Unknown143(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index + func.func private @Unknown150(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, 
%c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf32> - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown144(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf32> - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown145(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<64x64x3x3xf32> scf.for %arg1 = %c0 to %c36864 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = 
arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<64x64x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<64x64x3x3xf32> } return %alloc : memref<64x64x3x3xf32> } - func.func private @Unknown146(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index + func.func private @Unknown154(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf32> - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown147(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c73728 = 
arith.constant 73728 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x3x3xf32> scf.for %arg1 = %c0 to %c73728 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x64x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<128x64x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<128x64x3x3xf32> } return %alloc : memref<128x64x3x3xf32> } - func.func private @Unknown148(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c1 = arith.constant 1 : index + func.func private @Unknown155(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c1 = arith.constant 1 : index %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c147456 = arith.constant 147456 : index %alloc = memref.alloc() : memref<128x128x3x3xf32> scf.for %arg1 = %c0 to %c147456 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 
: index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<128x128x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<128x128x3x3xf32> } return %alloc : memref<128x128x3x3xf32> } - func.func private @Unknown149(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown156(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<128x64x1x1xf32> scf.for %arg1 = %c0 to %c8192 step %c1 { %0 = arith.remsi %arg1, %c64 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c64 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c64 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<128x64x1x1xf16> - %11 = arith.extf %10 : f16 to f32 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<128x64x1x1xf32> + %1 = arith.divsi %arg1, %c64 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<128x64x1x1xf16> + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<128x64x1x1xf32> } return %alloc : memref<128x64x1x1xf32> } - func.func private @Unknown150(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c1 = arith.constant 1 : index + func.func private @Unknown159(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<128x128x3x3xf32> - scf.for %arg1 = %c0 to %c147456 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = 
arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf32> - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown151(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<128x128x3x3xf32> - scf.for %arg1 = %c0 to %c147456 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf32> - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown152(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c294912 = arith.constant 294912 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x3x3xf32> scf.for %arg1 = %c0 to %c294912 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 
: index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x128x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x128x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<256x128x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<256x128x3x3xf32> } return %alloc : memref<256x128x3x3xf32> } - func.func private @Unknown153(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c1 = arith.constant 1 : index + func.func private @Unknown160(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c1 = arith.constant 1 : index %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index + %c589824 = arith.constant 589824 : index %alloc = memref.alloc() : memref<256x256x3x3xf32> scf.for %arg1 = %c0 to %c589824 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<256x256x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : 
memref<256x256x3x3xf32> } return %alloc : memref<256x256x3x3xf32> } - func.func private @Unknown154(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown161(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<256x128x1x1xf32> scf.for %arg1 = %c0 to %c32768 step %c1 { %0 = arith.remsi %arg1, %c128 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c128 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c128 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<256x128x1x1xf16> - %11 = arith.extf %10 : f16 to f32 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<256x128x1x1xf32> + %1 = arith.divsi %arg1, %c128 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<256x128x1x1xf16> + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<256x128x1x1xf32> } return %alloc : memref<256x128x1x1xf32> } - func.func private @Unknown155(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c1 = arith.constant 1 : index + func.func private @Unknown164(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x256x3x3xf32> - scf.for %arg1 = %c0 to %c589824 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf32> - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown156(%arg0: memref<256x256x3x3xf16>) -> 
memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x256x3x3xf32> - scf.for %arg1 = %c0 to %c589824 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf32> - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown157(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c1179648 = arith.constant 1179648 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x3x3xf32> scf.for %arg1 = %c0 to %c1179648 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x256x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : 
memref<512x256x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<512x256x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<512x256x3x3xf32> } return %alloc : memref<512x256x3x3xf32> } - func.func private @Unknown158(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index + func.func private @Unknown165(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c1 = arith.constant 1 : index %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index + %c2359296 = arith.constant 2359296 : index %alloc = memref.alloc() : memref<512x512x3x3xf32> scf.for %arg1 = %c0 to %c2359296 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<512x512x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<512x512x3x3xf32> } return %alloc : memref<512x512x3x3xf32> } - func.func private @Unknown159(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown166(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c1 = arith.constant 1 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<512x256x1x1xf32> scf.for %arg1 = %c0 to %c131072 step %c1 { %0 = arith.remsi %arg1, %c256 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = 
arith.addi %0, %c256 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c256 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<512x256x1x1xf16> - %11 = arith.extf %10 : f16 to f32 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<512x256x1x1xf32> + %1 = arith.divsi %arg1, %c256 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<512x256x1x1xf16> + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<512x256x1x1xf32> } return %alloc : memref<512x256x1x1xf32> } - func.func private @Unknown160(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + func.func private @Unknown170(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<512x512x3x3xf32> - scf.for %arg1 = %c0 to %c2359296 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf32> - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown161(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<512x512x3x3xf32> - scf.for %arg1 = %c0 to %c2359296 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = 
arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf32> - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown163(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c512000 = arith.constant 512000 : index - %c1 = arith.constant 1 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1000x512xf32> scf.for %arg1 = %c0 to %c512000 step %c1 { %0 = arith.remsi %arg1, %c512 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c512 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c512 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3] : memref<1000x512xf16> - %11 = arith.extf %10 : f16 to f32 - memref.store %11, %alloc[%9, %3] : memref<1000x512xf32> + %1 = arith.divsi %arg1, %c512 : index + %2 = memref.load %arg0[%1, %0] : memref<1000x512xf16> + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%1, %0] : memref<1000x512xf32> } return %alloc : memref<1000x512xf32> } - func.func private @Unknown164(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown171(%arg0: memref<4x1000xf16>) -> memref<1000xf32> attributes {__byteir_reduction_fusion__} { %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %cst_0 = arith.constant 0.000000e+00 : f32 + %c-32 = arith.constant -32 : index %c1000 = arith.constant 1000 : index + %c32 = arith.constant 32 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<1000xf32> + scf.forall (%arg1) in (32) { + %0 = arith.muli %arg1, %c-32 : index + %1 = arith.addi %0, %c1000 : index + %2 = arith.cmpi slt, %1, %c32 : index + %3 = arith.select %2, %1, %c32 : index + %4 = arith.muli %arg1, %c32 : index + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %alloca_1 = memref.alloca() : memref<2x32xf32, #gpu.address_space> + scf.forall (%arg2, %arg3) in (2, 32) { + %5 = arith.cmpi slt, %3, %arg3 : index + %6 = arith.select %5, %3, %arg3 : index + %7 = arith.addi %arg3, %c1 : index + %8 = arith.cmpi slt, %3, %7 : index + %9 = arith.select %8, %3, %7 : index + %10 = arith.subi %9, %6 : index + %11 = arith.cmpi ugt, %10, %c0 : index + %12 = scf.if %11 -> (f16) { + %18 = arith.muli %arg2, %c2 : index + %19 = arith.addi %4, %6 : index + %20 = memref.load 
%arg0[%18, %19] : memref<4x1000xf16> + scf.yield %20 : f16 + } else { + scf.yield %cst : f16 + } + %13 = arith.extf %12 : f16 to f32 + %14 = arith.addf %13, %cst_0 : f32 + %15 = scf.if %11 -> (f16) { + %18 = arith.muli %arg2, %c2 : index + %19 = arith.addi %18, %c1 : index + %20 = arith.addi %4, %6 : index + %21 = memref.load %arg0[%19, %20] : memref<4x1000xf16> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %16 = arith.extf %15 : f16 to f32 + %17 = arith.addf %14, %16 : f32 + memref.store %17, %alloca_1[%arg2, %arg3] : memref<2x32xf32, #gpu.address_space> + } {mapping = [#gpu.thread, #gpu.thread]} + scf.forall (%arg2) in (32) { + %5 = memref.load %alloca_1[%c0, %arg2] : memref<2x32xf32, #gpu.address_space> + %6 = arith.addf %5, %cst_0 : f32 + %7 = memref.load %alloca_1[%c1, %arg2] : memref<2x32xf32, #gpu.address_space> + %8 = arith.addf %7, %6 : f32 + memref.store %8, %alloca[%arg2] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %subview = memref.subview %alloca[0] [%3] [1] : memref<32xf32, #gpu.address_space> to memref, #gpu.address_space> + %subview_2 = memref.subview %alloc[%4] [%3] [1] : memref<1000xf32> to memref> + memref.copy %subview, %subview_2 : memref, #gpu.address_space> to memref> + } {mapping = [#gpu.block]} + return %alloc : memref<1000xf32> + } + func.func private @Unknown172(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %alloc = memref.alloc() : memref<1000xf32> scf.for %arg1 = %c0 to %c1000 step %c1 { %0 = memref.load %arg0[%arg1] : memref<1000xf32> @@ -3654,344 +1839,340 @@ module @IrToMhlo.2452 { %alloc_0 = memref.alloc() : memref<4x64x112x112xf16> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc, %arg3, %arg4, %alloc_0) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x112x112xf16> %2 = call @Unknown3(%arg7) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %3 = call @Unknown4(%arg12) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %4 = call @Unknown5(%arg17) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %5 = call @Unknown6(%arg22) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %3 = call @Unknown3(%arg12) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %4 = call @Unknown3(%arg17) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %5 = call @Unknown3(%arg22) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %6 = call @Unknown7(%arg37) : (memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> %7 = call @Unknown8(%arg27) : (memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> %8 = call @Unknown9(%arg32) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> - %9 = call @Unknown10(%arg42) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> - %10 = call @Unknown11(%arg47) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %9 = call @Unknown9(%arg42) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %10 = call @Unknown9(%arg47) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %11 = call @Unknown12(%arg62) : (memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> %12 = call @Unknown13(%arg52) : (memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> %13 = call @Unknown14(%arg57) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> - %14 = call @Unknown15(%arg67) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> - 
%15 = call @Unknown16(%arg72) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %14 = call @Unknown14(%arg67) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %15 = call @Unknown14(%arg72) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %16 = call @Unknown17(%arg87) : (memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> %17 = call @Unknown18(%arg77) : (memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> %18 = call @Unknown19(%arg82) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> - %19 = call @Unknown20(%arg92) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> - %20 = call @Unknown21(%arg97) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %19 = call @Unknown19(%arg92) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %20 = call @Unknown19(%arg97) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %21 = call @Unknown22(%arg1) : (memref<4x1000xf32>) -> memref<4x1000xf16> %22 = call @Unknown23(%arg102) : (memref<1000x512xf32>) -> memref<1000x512xf16> - %alloc_1 = memref.alloc() : memref<4xf16> - byre.compute @ReduceSumOp_f16_f16(%21, %alloc_1) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %23:2 = call @Unknown24(%alloc_0) : (memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) + %23 = call @Unknown24(%arg103) : (memref<1000xf32>) -> memref<1000xf16> + %24 = call @Unknown25(%21) : (memref<4x1000xf16>) -> memref<4xf16> + %25:2 = call @Unknown26(%alloc_0) : (memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) + %alloc_1 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @PoolMaxOp_f16_f16(%25#0, %alloc_1) {base_dilations = dense<1> : tensor<4xi64>, memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16> %alloc_2 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @PoolMaxOp_f16_f16(%23#0, %alloc_2) {base_dilations = dense<1> : tensor<4xi64>, memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16> + byre.compute @ConvOp_f16f16_f16(%alloc_1, %2, %alloc_2) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_3 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%alloc_2, %2, %alloc_3) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute 
@BatchNormTrainingOp_f16f32f32_f16(%alloc_2, %arg8, %arg9, %alloc_3) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %26:2 = call @Unknown28(%alloc_3) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_4 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_3, %arg8, %arg9, %alloc_4) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %24:2 = call @Unknown26(%alloc_4) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%26#0, %3, %alloc_4) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_5 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%24#0, %3, %alloc_5) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_4, %arg13, %arg14, %alloc_5) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %27:2 = call @Unknown30(%alloc_5, %alloc_1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_6 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_5, %arg13, %arg14, %alloc_6) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %25:2 = call @Unknown28(%alloc_6, %alloc_2) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%27#0, %4, %alloc_6) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_7 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%25#0, %4, %alloc_7) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation 
= dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_6, %arg18, %arg19, %alloc_7) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %28:2 = call @Unknown28(%alloc_7) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_8 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_7, %arg18, %arg19, %alloc_8) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %26:2 = call @Unknown30(%alloc_8) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%28#0, %5, %alloc_8) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_9 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%26#0, %5, %alloc_9) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_10 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_9, %arg23, %arg24, %alloc_10) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %27:2 = call @Unknown32(%alloc_10, %25#0) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_8, %arg23, %arg24, %alloc_9) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %29:2 = call @Unknown30(%alloc_9, %27#0) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + %alloc_10 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvOp_f16f16_f16(%29#0, %6, %alloc_10) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x1x1xf16>, memref<4x128x28x28xf16> %alloc_11 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%27#0, %6, %alloc_11) {batch_group_count = 1 : i64, 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x1x1xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_10, %arg38, %arg39, %alloc_11) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> %alloc_12 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_11, %arg38, %arg39, %alloc_12) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + byre.compute @ConvOp_f16f16_f16(%29#0, %7, %alloc_12) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x3x3xf16>, memref<4x128x28x28xf16> %alloc_13 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%27#0, %7, %alloc_13) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_12, %arg28, %arg29, %alloc_13) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %30:2 = call @Unknown37(%alloc_13) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_14 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_13, %arg28, %arg29, %alloc_14) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %28:2 = call @Unknown35(%alloc_14) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%30#0, %8, %alloc_14) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_15 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%28#0, %8, %alloc_15) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", 
lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_14, %arg33, %arg34, %alloc_15) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %31:2 = call @Unknown39(%alloc_15, %alloc_11) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_16 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_15, %arg33, %arg34, %alloc_16) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %29:2 = call @Unknown37(%alloc_16, %alloc_12) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%31#0, %9, %alloc_16) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_17 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%29#0, %9, %alloc_17) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_16, %arg43, %arg44, %alloc_17) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %32:2 = call @Unknown37(%alloc_17) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_18 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_17, %arg43, %arg44, %alloc_18) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %30:2 = call @Unknown39(%alloc_18) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%32#0, %10, %alloc_18) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, 
memref<4x128x28x28xf16> %alloc_19 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%30#0, %10, %alloc_19) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_20 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_19, %arg48, %arg49, %alloc_20) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %31:2 = call @Unknown41(%alloc_20, %29#0) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_18, %arg48, %arg49, %alloc_19) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %33:2 = call @Unknown39(%alloc_19, %31#0) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + %alloc_20 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvOp_f16f16_f16(%33#0, %11, %alloc_20) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x1x1xf16>, memref<4x256x14x14xf16> %alloc_21 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%31#0, %11, %alloc_21) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x1x1xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_20, %arg63, %arg64, %alloc_21) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> %alloc_22 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_21, %arg63, %arg64, %alloc_22) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + byre.compute @ConvOp_f16f16_f16(%33#0, %12, %alloc_22) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : 
memref<4x128x28x28xf16>, memref<256x128x3x3xf16>, memref<4x256x14x14xf16> %alloc_23 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%31#0, %12, %alloc_23) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_22, %arg53, %arg54, %alloc_23) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %34:2 = call @Unknown46(%alloc_23) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_24 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_23, %arg53, %arg54, %alloc_24) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %32:2 = call @Unknown44(%alloc_24) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%34#0, %13, %alloc_24) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_25 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%32#0, %13, %alloc_25) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_24, %arg58, %arg59, %alloc_25) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %35:2 = call @Unknown48(%alloc_25, %alloc_21) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_26 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_25, %arg58, %arg59, %alloc_26) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %33:2 = call @Unknown46(%alloc_26, %alloc_22) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%35#0, %14, %alloc_26) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = 
"NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_27 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%33#0, %14, %alloc_27) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_26, %arg68, %arg69, %alloc_27) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %36:2 = call @Unknown46(%alloc_27) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_28 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_27, %arg68, %arg69, %alloc_28) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %34:2 = call @Unknown48(%alloc_28) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%36#0, %15, %alloc_28) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_29 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%34#0, %15, %alloc_29) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_30 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_29, %arg73, %arg74, %alloc_30) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %35:2 = call @Unknown50(%alloc_30, %33#0) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_28, %arg73, %arg74, %alloc_29) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %37:2 = call @Unknown48(%alloc_29, %35#0) : (memref<4x256x14x14xf16>, 
memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + %alloc_30 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvOp_f16f16_f16(%37#0, %16, %alloc_30) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x1x1xf16>, memref<4x512x7x7xf16> %alloc_31 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%35#0, %16, %alloc_31) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x1x1xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_30, %arg88, %arg89, %alloc_31) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> %alloc_32 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_31, %arg88, %arg89, %alloc_32) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + byre.compute @ConvOp_f16f16_f16(%37#0, %17, %alloc_32) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x3x3xf16>, memref<4x512x7x7xf16> %alloc_33 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%35#0, %17, %alloc_33) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_32, %arg78, %arg79, %alloc_33) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %38:2 = call @Unknown55(%alloc_33) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_34 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_33, %arg78, %arg79, %alloc_34) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %36:2 = call @Unknown53(%alloc_34) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, 
memref<4x512x7x7xi1>) + byre.compute @ConvOp_f16f16_f16(%38#0, %18, %alloc_34) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_35 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%36#0, %18, %alloc_35) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_34, %arg83, %arg84, %alloc_35) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %39:2 = call @Unknown57(%alloc_35, %alloc_31) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_36 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_35, %arg83, %arg84, %alloc_36) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %37:2 = call @Unknown55(%alloc_36, %alloc_32) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @ConvOp_f16f16_f16(%39#0, %19, %alloc_36) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_37 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%37#0, %19, %alloc_37) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_36, %arg93, %arg94, %alloc_37) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %40:2 = call @Unknown55(%alloc_37) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_38 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_37, %arg93, %arg94, %alloc_38) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : 
memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %38:2 = call @Unknown57(%alloc_38) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @ConvOp_f16f16_f16(%40#0, %20, %alloc_38) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_39 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%38#0, %20, %alloc_39) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_40 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_39, %arg98, %arg99, %alloc_40) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %39:2 = call @Unknown59(%alloc_40, %37#0) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_38, %arg98, %arg99, %alloc_39) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %41:2 = call @Unknown57(%alloc_39, %39#0) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + %42 = call @Unknown62(%41#0) : (memref<4x512x7x7xf16>) -> memref<4x512xf16> + %43 = call @Unknown63(%42) : (memref<4x512xf16>) -> memref<4x512xf16> + %alloc_40 = memref.alloc() : memref<4x1000xf16> + byre.compute @MatmulOp_f16f16_f16(%43, %22, %alloc_40) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16>, memref<1000x512xf16>, memref<4x1000xf16> + %44 = call @Unknown64(%23, %alloc_40) : (memref<1000xf16>, memref<4x1000xf16>) -> memref<4x1000xf16> + %45 = call @Unknown65(%44) : (memref<4x1000xf16>) -> memref<4xf16> + %46 = call @Unknown66(%45, %44) : (memref<4xf16>, memref<4x1000xf16>) -> memref<4x1000xf16> + %47 = call @Unknown67(%46) : (memref<4x1000xf16>) -> memref<4xf16> + %48 = call @Unknown68(%47) : (memref<4xf16>) -> memref<4xf16> + %49:2 = call @Unknown69(%48, %46, %24, %21) : (memref<4xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) %alloc_41 = memref.alloc() : memref<4x512xf16> - byre.compute @ReduceSumOp_f16_f16(%39#0, %alloc_41) {dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<4x512xf16> - %40 = call @Unknown60(%alloc_41) : (memref<4x512xf16>) -> memref<4x512xf16> - %alloc_42 = memref.alloc() : memref<4x1000xf16> - byre.compute @MatmulOp_f16f16_f16(%40, %22, %alloc_42) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : 
i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16>, memref<1000x512xf16>, memref<4x1000xf16> - %41 = call @Unknown61(%arg103, %alloc_42) : (memref<1000xf32>, memref<4x1000xf16>) -> memref<4x1000xf16> - %alloc_43 = memref.alloc() : memref<4xf16> - byre.compute @ReduceMaxOp_f16_f16(%41, %alloc_43) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %42:2 = call @Unknown62(%alloc_43, %41) : (memref<4xf16>, memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) - %alloc_44 = memref.alloc() : memref<4xf16> - byre.compute @ReduceSumOp_f16_f16(%42#1, %alloc_44) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %43:3 = call @Unknown63(%alloc_44, %42#0, %alloc_1, %21, %arg1) : (memref<4xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4x1000xf16>, memref<4x1000xf32>) -> (memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>) - %alloc_45 = memref.alloc() : memref<4x512xf16> - byre.compute @MatmulOp_f16f16_f16(%43#0, %22, %alloc_45) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16>, memref<1000x512xf16>, memref<4x512xf16> - %44 = call @Unknown64(%alloc_45, %39#1) : (memref<4x512xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> - %alloc_46 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_47 = memref.alloc() : memref<512xf32> + byre.compute @MatmulOp_f16f16_f16(%49#1, %22, %alloc_41) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16>, memref<1000x512xf16>, memref<4x512xf16> + %50 = call @Unknown70(%alloc_41, %41#1) : (memref<4x512xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> + %alloc_42 = memref.alloc() : memref<4x512x7x7xf16> + %alloc_43 = memref.alloc() : memref<512xf32> + %alloc_44 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_38, %arg98, %50, %alloc_42, %alloc_43, %alloc_44) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_45 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_42, %20, %alloc_45) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_46 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%40#0, %alloc_42, %alloc_46) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %51 = call @Unknown74(%40#1, %alloc_45) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> + %alloc_47 = memref.alloc() : memref<4x512x7x7xf16> %alloc_48 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_39, 
%arg98, %44, %alloc_46, %alloc_47, %alloc_48) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_49 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_46, %20, %alloc_49) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_50 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_46, %alloc_50) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %45 = call @Unknown68(%38#1, %alloc_49) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> - %alloc_51 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_52 = memref.alloc() : memref<512xf32> + %alloc_49 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_36, %arg93, %51, %alloc_47, %alloc_48, %alloc_49) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_50 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_47, %19, %alloc_50) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_51 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%39#0, %alloc_47, %alloc_51) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %52 = call @Unknown78(%50, %alloc_50, %39#1) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> + %alloc_52 = memref.alloc() : memref<4x512x7x7xf16> %alloc_53 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_37, %arg93, %45, %alloc_51, %alloc_52, %alloc_53) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_54 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_51, %19, %alloc_54) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = 
"NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_55 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_51, %alloc_55) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %46 = call @Unknown72(%44, %alloc_54, %37#1) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> - %alloc_56 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_57 = memref.alloc() : memref<512xf32> + %alloc_54 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_34, %arg83, %52, %alloc_52, %alloc_53, %alloc_54) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_55 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_52, %18, %alloc_55) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_56 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_52, %alloc_56) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %53 = call @Unknown74(%38#1, %alloc_55) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> + %alloc_57 = memref.alloc() : memref<4x512x7x7xf16> %alloc_58 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_35, %arg83, %46, %alloc_56, %alloc_57, %alloc_58) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_59 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_56, %18, %alloc_59) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_60 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_56, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", 
memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %47 = call @Unknown76(%36#1, %alloc_59) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> - %alloc_61 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_62 = memref.alloc() : memref<512xf32> + %alloc_59 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_32, %arg78, %53, %alloc_57, %alloc_58, %alloc_59) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_60 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_57, %17, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_61 = memref.alloc() : memref<512x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_57, %alloc_61) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x3x3xf16> + %alloc_62 = memref.alloc() : memref<4x512x7x7xf16> %alloc_63 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_33, %arg78, %47, %alloc_61, %alloc_62, %alloc_63) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_64 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_61, %17, %alloc_64) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_65 = memref.alloc() : memref<512x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_61, %alloc_65) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x3x3xf16> - %alloc_66 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_67 = memref.alloc() : memref<512xf32> - %alloc_68 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_31, %arg88, %46, %alloc_66, %alloc_67, %alloc_68) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : 
memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_69 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_66, %16, %alloc_69) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x1x1xf16>, memref<4x256x14x14xf16> - %alloc_70 = memref.alloc() : memref<512x256x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_66, %alloc_70) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x1x1xf16> - %48 = call @Unknown83(%alloc_69, %alloc_64, %35#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> - %alloc_71 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_72 = memref.alloc() : memref<256xf32> + %alloc_64 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_30, %arg88, %52, %alloc_62, %alloc_63, %alloc_64) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_65 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_62, %16, %alloc_65) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x1x1xf16>, memref<4x256x14x14xf16> + %alloc_66 = memref.alloc() : memref<512x256x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_62, %alloc_66) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x1x1xf16> + %54 = call @Unknown89(%alloc_65, %alloc_60, %37#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> + %alloc_67 = memref.alloc() : memref<4x256x14x14xf16> + %alloc_68 = memref.alloc() : memref<256xf32> + %alloc_69 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_28, %arg73, %54, %alloc_67, %alloc_68, %alloc_69) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_70 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_67, %15, %alloc_70) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = 
[1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_71 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_67, %alloc_71) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %55 = call @Unknown93(%36#1, %alloc_70) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> + %alloc_72 = memref.alloc() : memref<4x256x14x14xf16> %alloc_73 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_29, %arg73, %48, %alloc_71, %alloc_72, %alloc_73) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_74 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_71, %15, %alloc_74) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_75 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_71, %alloc_75) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %49 = call @Unknown87(%34#1, %alloc_74) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> - %alloc_76 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_77 = memref.alloc() : memref<256xf32> + %alloc_74 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_26, %arg68, %55, %alloc_72, %alloc_73, %alloc_74) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_75 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_72, %14, %alloc_75) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_76 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_72, %alloc_76) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : 
i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %56 = call @Unknown89(%54, %alloc_75, %35#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> + %alloc_77 = memref.alloc() : memref<4x256x14x14xf16> %alloc_78 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_27, %arg68, %49, %alloc_76, %alloc_77, %alloc_78) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_79 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_76, %14, %alloc_79) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_80 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_76, %alloc_80) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %50 = call @Unknown91(%48, %alloc_79, %33#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> - %alloc_81 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_82 = memref.alloc() : memref<256xf32> + %alloc_79 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_24, %arg58, %56, %alloc_77, %alloc_78, %alloc_79) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_80 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_77, %13, %alloc_80) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_81 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_77, %alloc_81) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %57 = call @Unknown93(%34#1, %alloc_80) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> + %alloc_82 = memref.alloc() : memref<4x256x14x14xf16> %alloc_83 = memref.alloc() : memref<256xf32> - 
byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_25, %arg58, %50, %alloc_81, %alloc_82, %alloc_83) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_84 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_81, %13, %alloc_84) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_85 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_81, %alloc_85) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %51 = call @Unknown95(%32#1, %alloc_84) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> - %alloc_86 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_87 = memref.alloc() : memref<256xf32> + %alloc_84 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_22, %arg53, %57, %alloc_82, %alloc_83, %alloc_84) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_85 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_82, %12, %alloc_85) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_86 = memref.alloc() : memref<256x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_82, %alloc_86) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x3x3xf16> + %alloc_87 = memref.alloc() : memref<4x256x14x14xf16> %alloc_88 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_23, %arg53, %51, %alloc_86, %alloc_87, %alloc_88) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_89 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_86, %12, %alloc_89) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", 
memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_90 = memref.alloc() : memref<256x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_86, %alloc_90) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x3x3xf16> - %alloc_91 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_92 = memref.alloc() : memref<256xf32> - %alloc_93 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_21, %arg63, %50, %alloc_91, %alloc_92, %alloc_93) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_94 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_91, %11, %alloc_94) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x1x1xf16>, memref<4x128x28x28xf16> - %alloc_95 = memref.alloc() : memref<256x128x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_91, %alloc_95) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x1x1xf16> - %52 = call @Unknown102(%alloc_94, %alloc_89, %31#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> - %alloc_96 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_97 = memref.alloc() : memref<128xf32> + %alloc_89 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_20, %arg63, %56, %alloc_87, %alloc_88, %alloc_89) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_90 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_87, %11, %alloc_90) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x1x1xf16>, memref<4x128x28x28xf16> + %alloc_91 = memref.alloc() : memref<256x128x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_87, %alloc_91) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x1x1xf16> + %58 = call @Unknown108(%alloc_90, %alloc_85, %33#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> + %alloc_92 = memref.alloc() : memref<4x128x28x28xf16> + %alloc_93 = memref.alloc() : memref<128xf32> + %alloc_94 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_18, %arg48, %58, %alloc_92, %alloc_93, %alloc_94) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_95 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_92, %10, %alloc_95) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_96 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_92, %alloc_96) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %59 = call @Unknown112(%32#1, %alloc_95) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> + %alloc_97 = memref.alloc() : memref<4x128x28x28xf16> %alloc_98 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_19, %arg48, %52, %alloc_96, %alloc_97, %alloc_98) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_99 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_96, %10, %alloc_99) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_100 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_96, %alloc_100) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %53 = call @Unknown106(%30#1, %alloc_99) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> - %alloc_101 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_102 = memref.alloc() : memref<128xf32> + %alloc_99 = memref.alloc() : 
memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_16, %arg43, %59, %alloc_97, %alloc_98, %alloc_99) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_100 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_97, %9, %alloc_100) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_101 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_97, %alloc_101) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %60 = call @Unknown108(%58, %alloc_100, %31#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> + %alloc_102 = memref.alloc() : memref<4x128x28x28xf16> %alloc_103 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_17, %arg43, %53, %alloc_101, %alloc_102, %alloc_103) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_104 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_101, %9, %alloc_104) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_105 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_101, %alloc_105) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %54 = call @Unknown110(%52, %alloc_104, %29#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> - %alloc_106 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_107 = memref.alloc() : memref<128xf32> + %alloc_104 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_14, %arg33, %60, %alloc_102, %alloc_103, %alloc_104) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_105 = 
memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_102, %8, %alloc_105) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_106 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_102, %alloc_106) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %61 = call @Unknown112(%30#1, %alloc_105) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> + %alloc_107 = memref.alloc() : memref<4x128x28x28xf16> %alloc_108 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_15, %arg33, %54, %alloc_106, %alloc_107, %alloc_108) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_109 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_106, %8, %alloc_109) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_110 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_106, %alloc_110) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %55 = call @Unknown114(%28#1, %alloc_109) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> - %alloc_111 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_112 = memref.alloc() : memref<128xf32> + %alloc_109 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_12, %arg28, %61, %alloc_107, %alloc_108, %alloc_109) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_110 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_107, %7, %alloc_110) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x3x3xf16>, memref<4x64x56x56xf16> + 
%alloc_111 = memref.alloc() : memref<128x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_107, %alloc_111) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x3x3xf16> + %alloc_112 = memref.alloc() : memref<4x128x28x28xf16> %alloc_113 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_13, %arg28, %55, %alloc_111, %alloc_112, %alloc_113) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_114 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_111, %7, %alloc_114) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_115 = memref.alloc() : memref<128x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_111, %alloc_115) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x3x3xf16> - %alloc_116 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_117 = memref.alloc() : memref<128xf32> - %alloc_118 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_11, %arg38, %54, %alloc_116, %alloc_117, %alloc_118) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_119 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_116, %6, %alloc_119) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x1x1xf16>, memref<4x64x56x56xf16> - %alloc_120 = memref.alloc() : memref<128x64x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_116, %alloc_120) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x1x1xf16> - %56 = call @Unknown121(%alloc_119, %alloc_114, %27#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> - %alloc_121 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_122 = memref.alloc() : 
memref<64xf32> + %alloc_114 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_10, %arg38, %60, %alloc_112, %alloc_113, %alloc_114) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_115 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_112, %6, %alloc_115) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x1x1xf16>, memref<4x64x56x56xf16> + %alloc_116 = memref.alloc() : memref<128x64x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_112, %alloc_116) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x1x1xf16> + %62 = call @Unknown127(%alloc_115, %alloc_110, %29#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> + %alloc_117 = memref.alloc() : memref<4x64x56x56xf16> + %alloc_118 = memref.alloc() : memref<64xf32> + %alloc_119 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_8, %arg23, %62, %alloc_117, %alloc_118, %alloc_119) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_120 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_117, %5, %alloc_120) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_121 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_117, %alloc_121) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %63 = call @Unknown131(%28#1, %alloc_120) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_122 = memref.alloc() : memref<4x64x56x56xf16> %alloc_123 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_9, %arg23, %56, %alloc_121, %alloc_122, %alloc_123) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_124 = 
memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_121, %5, %alloc_124) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_125 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_121, %alloc_125) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %57 = call @Unknown125(%26#1, %alloc_124) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_126 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_127 = memref.alloc() : memref<64xf32> + %alloc_124 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_6, %arg18, %63, %alloc_122, %alloc_123, %alloc_124) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_125 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_122, %4, %alloc_125) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_126 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_122, %alloc_126) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %64 = call @Unknown127(%62, %alloc_125, %27#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> + %alloc_127 = memref.alloc() : memref<4x64x56x56xf16> %alloc_128 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_7, %arg18, %57, %alloc_126, %alloc_127, %alloc_128) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_129 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_126, %4, %alloc_129) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_130 = memref.alloc() 
: memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%25#0, %alloc_126, %alloc_130) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %58 = call @Unknown129(%56, %alloc_129, %25#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> - %alloc_131 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_132 = memref.alloc() : memref<64xf32> + %alloc_129 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_4, %arg13, %64, %alloc_127, %alloc_128, %alloc_129) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_130 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_127, %3, %alloc_130) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_131 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_127, %alloc_131) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %65 = call @Unknown131(%26#1, %alloc_130) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_132 = memref.alloc() : memref<4x64x56x56xf16> %alloc_133 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_5, %arg13, %58, %alloc_131, %alloc_132, %alloc_133) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_134 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_131, %3, %alloc_134) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_135 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%24#0, %alloc_131, %alloc_135) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %59 = call @Unknown133(%24#1, 
%alloc_134) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_136 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_137 = memref.alloc() : memref<64xf32> - %alloc_138 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_3, %arg8, %59, %alloc_136, %alloc_137, %alloc_138) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_139 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_136, %2, %alloc_139) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_140 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%alloc_2, %alloc_136, %alloc_140) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %60 = call @Unknown137(%58, %alloc_139) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_141 = memref.alloc() : memref<4x64x112x112xf16> - byre.compute @PoolMaxGradOp_f16f16_f16(%23#0, %60, %alloc_141) {memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16>, memref<4x64x112x112xf16> - %61 = call @Unknown138(%23#1, %alloc_141) : (memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> - %alloc_142 = memref.alloc() : memref<4x64x112x112xf16> - %alloc_143 = memref.alloc() : memref<64xf32> - %alloc_144 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %61, %alloc_142, %alloc_143, %alloc_144) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<4x64x112x112xf16>, memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32> - %alloc_145 = memref.alloc() : memref<64x3x7x7xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_142, %alloc_145) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16>, memref<4x64x112x112xf16>, memref<64x3x7x7xf16> - %alloc_146 = memref.alloc() : memref - byre.compute @ReduceSumOp_f32_f32(%43#1, %alloc_146) {dimensions = dense<[0, 1]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32>, memref - %62 = call @Unknown141(%alloc_146) : (memref) -> memref - %63 = call @Unknown142(%alloc_145) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> - %64 = call @Unknown143(%alloc_140) 
: (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %65 = call @Unknown144(%alloc_135) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %66 = call @Unknown145(%alloc_130) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %67 = call @Unknown146(%alloc_125) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %68 = call @Unknown147(%alloc_115) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> - %69 = call @Unknown148(%alloc_110) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %70 = call @Unknown149(%alloc_120) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> - %71 = call @Unknown150(%alloc_105) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %72 = call @Unknown151(%alloc_100) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %73 = call @Unknown152(%alloc_90) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> - %74 = call @Unknown153(%alloc_85) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %75 = call @Unknown154(%alloc_95) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> - %76 = call @Unknown155(%alloc_80) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %77 = call @Unknown156(%alloc_75) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %78 = call @Unknown157(%alloc_65) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> - %79 = call @Unknown158(%alloc_60) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %80 = call @Unknown159(%alloc_70) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> - %81 = call @Unknown160(%alloc_55) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %82 = call @Unknown161(%alloc_50) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %alloc_147 = memref.alloc() : memref<1000x512xf16> - byre.compute @MatmulOp_f16f16_f16(%40, %43#0, %alloc_147) {lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16>, memref<4x1000xf16>, memref<1000x512xf16> - %83 = call @Unknown163(%alloc_147) : (memref<1000x512xf16>) -> memref<1000x512xf32> - %alloc_148 = memref.alloc() : memref<1000xf32> - byre.compute @ReduceSumOp_f32_f32(%43#2, %alloc_148) {dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32>, memref<1000xf32> - %84 = call @Unknown164(%alloc_148) : (memref<1000xf32>) -> memref<1000xf32> - return %62, %63, %alloc_143, %alloc_144, %64, %alloc_137, %alloc_138, %65, %alloc_132, %alloc_133, %66, %alloc_127, %alloc_128, %67, %alloc_122, %alloc_123, %68, %alloc_112, %alloc_113, %69, %alloc_107, %alloc_108, %70, %alloc_117, %alloc_118, %71, %alloc_102, %alloc_103, %72, %alloc_97, %alloc_98, %73, %alloc_87, %alloc_88, %74, %alloc_82, %alloc_83, %75, %alloc_92, %alloc_93, %76, %alloc_77, %alloc_78, %77, %alloc_72, %alloc_73, %78, %alloc_62, %alloc_63, %79, %alloc_57, %alloc_58, %80, %alloc_67, %alloc_68, %81, %alloc_52, %alloc_53, %82, %alloc_47, %alloc_48, %83, %84 : memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, 
memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32> + %alloc_134 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_2, %arg8, %65, %alloc_132, %alloc_133, %alloc_134) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_135 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_132, %2, %alloc_135) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_136 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%alloc_1, %alloc_132, %alloc_136) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %66 = call @Unknown143(%64, %alloc_135) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_137 = memref.alloc() : memref<4x64x112x112xf16> + byre.compute @PoolMaxGradOp_f16f16_f16(%25#0, %66, %alloc_137) {memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16>, memref<4x64x112x112xf16> + %67 = call @Unknown144(%25#1, %alloc_137) : (memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> + %alloc_138 = memref.alloc() : memref<4x64x112x112xf16> + %alloc_139 = memref.alloc() : memref<64xf32> + %alloc_140 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %67, %alloc_138, %alloc_139, %alloc_140) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<4x64x112x112xf16>, memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32> + %alloc_141 = memref.alloc() : memref<64x3x7x7xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_138, %alloc_141) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : 
memref<4x3x224x224xf16>, memref<4x64x112x112xf16>, memref<64x3x7x7xf16> + %68 = call @Unknown147(%49#0, %arg1) : (memref<4x1000xf16>, memref<4x1000xf32>) -> memref + %69 = call @Unknown148(%68) : (memref) -> memref + %70 = call @Unknown149(%alloc_141) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> + %71 = call @Unknown150(%alloc_136) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %72 = call @Unknown150(%alloc_131) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %73 = call @Unknown150(%alloc_126) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %74 = call @Unknown150(%alloc_121) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %75 = call @Unknown154(%alloc_111) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> + %76 = call @Unknown155(%alloc_106) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %77 = call @Unknown156(%alloc_116) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> + %78 = call @Unknown155(%alloc_101) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %79 = call @Unknown155(%alloc_96) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %80 = call @Unknown159(%alloc_86) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> + %81 = call @Unknown160(%alloc_81) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %82 = call @Unknown161(%alloc_91) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> + %83 = call @Unknown160(%alloc_76) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %84 = call @Unknown160(%alloc_71) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %85 = call @Unknown164(%alloc_61) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> + %86 = call @Unknown165(%alloc_56) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %87 = call @Unknown166(%alloc_66) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> + %88 = call @Unknown165(%alloc_51) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %89 = call @Unknown165(%alloc_46) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %alloc_142 = memref.alloc() : memref<1000x512xf16> + byre.compute @MatmulOp_f16f16_f16(%43, %49#1, %alloc_142) {lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16>, memref<4x1000xf16>, memref<1000x512xf16> + %90 = call @Unknown170(%alloc_142) : (memref<1000x512xf16>) -> memref<1000x512xf32> + %91 = call @Unknown171(%49#1) : (memref<4x1000xf16>) -> memref<1000xf32> + %92 = call @Unknown172(%91) : (memref<1000xf32>) -> memref<1000xf32> + return %69, %70, %alloc_139, %alloc_140, %71, %alloc_133, %alloc_134, %72, %alloc_128, %alloc_129, %73, %alloc_123, %alloc_124, %74, %alloc_118, %alloc_119, %75, %alloc_108, %alloc_109, %76, %alloc_103, %alloc_104, %77, %alloc_113, %alloc_114, %78, %alloc_98, %alloc_99, %79, %alloc_93, %alloc_94, %80, %alloc_83, %alloc_84, %81, %alloc_78, %alloc_79, %82, %alloc_88, %alloc_89, %83, %alloc_73, %alloc_74, %84, %alloc_68, %alloc_69, %85, %alloc_58, %alloc_59, %86, %alloc_53, %alloc_54, %87, %alloc_63, %alloc_64, %88, %alloc_48, %alloc_49, %89, %alloc_43, %alloc_44, %90, %92 : memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, 
memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/7_set_space_opt.mlir b/compiler/test/E2E/ResNet18/Whole/7_set_space_opt.mlir index a4de5d088..c8fa12017 100644 --- a/compiler/test/E2E/ResNet18/Whole/7_set_space_opt.mlir +++ b/compiler/test/E2E/ResNet18/Whole/7_set_space_opt.mlir @@ -1,4673 +1,2788 @@ -// RUN: byteir-opt %s -remove-func-body="anchor-attr=__byteir_elementwise_fusion__" -set-op-space="entry-func=main space=cuda" -set-arg-space="entry-func=main all-space=cuda" | FileCheck %s +// RUN: byteir-opt %s -remove-func-body="anchor-attr=__byteir_elementwise_fusion__" -inline -gpu-launch-func-to-byre -set-op-space="entry-func=main space=cuda" -set-arg-space="entry-func=main all-space=cuda" | FileCheck %s // CHECK-LABEL: func.func @main module @IrToMhlo.2452 attributes {gpu.container_module} { gpu.module @unified { - gpu.func @Unknown164(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { + gpu.func @Unknown172(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<1000xf32> - %7 = arith.truncf %6 : f32 to f16 - %8 = arith.extf %7 : f16 to f32 - memref.store %8, %arg1[%4] : memref<1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg2] : memref<1000xf32> + %8 = arith.truncf %7 : f32 to f16 + %9 = arith.extf %8 : f16 to f32 + memref.store %9, %arg1[%arg2] : memref<1000xf32> } gpu.return } - gpu.func @Unknown163(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown170(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf32> + %5 = 
gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf32> } gpu.return } - gpu.func @Unknown161(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { + gpu.func @Unknown166(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { + %c131072 = arith.constant 131072 : index %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> } gpu.return } - gpu.func @Unknown160(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown165(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = 
arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf32> } gpu.return } - gpu.func @Unknown159(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c131072 = arith.constant 131072 : index + gpu.func @Unknown164(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf32> } gpu.return } - gpu.func @Unknown158(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { + gpu.func @Unknown161(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { + %c32768 = arith.constant 32768 : index %c0 = arith.constant 0 : index - %c2359296 = 
arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> } gpu.return } - gpu.func @Unknown157(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown160(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { + %c589824 = arith.constant 589824 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, 
%c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf32> } gpu.return } - gpu.func @Unknown156(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index + gpu.func @Unknown159(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { + %c294912 = arith.constant 294912 : index + %c128 = arith.constant 128 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf32> } gpu.return } - gpu.func 
@Unknown155(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { + gpu.func @Unknown156(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { + %c8192 = arith.constant 8192 : index %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> } gpu.return } - gpu.func @Unknown154(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c32768 = arith.constant 32768 : index + gpu.func @Unknown155(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { + %c147456 = arith.constant 147456 : index %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for 
%arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf32> } gpu.return } - gpu.func @Unknown153(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index + gpu.func @Unknown154(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { + %c73728 = arith.constant 73728 : index + %c64 = arith.constant 64 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf32> } gpu.return } - gpu.func @Unknown152(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index + gpu.func @Unknown150(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { + %c36864 = arith.constant 36864 : index + %c64 = arith.constant 64 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - 
scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf32> } gpu.return } - gpu.func @Unknown151(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index + gpu.func @Unknown149(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { + %c9408 = arith.constant 9408 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - 
%33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf32> } gpu.return } - gpu.func @Unknown150(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown148(%arg0: memref, %arg1: memref) kernel { + %c1 = arith.constant 1 : index + %cst = arith.constant 4.000000e+00 : f32 %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1 step %6 { + %7 = memref.load %arg0[] : memref + %8 = arith.negf %7 : f32 + %9 = arith.divf %8, %cst : f32 + memref.store %9, %arg1[] : memref } gpu.return } - gpu.func @Unknown149(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c8192 = arith.constant 8192 : index + gpu.func @Unknown144(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xf16>) kernel { + %c3211264 = arith.constant 3211264 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = 
arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c3211264 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x112x112xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x112x112xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x112x112xf16> } gpu.return } - gpu.func @Unknown148(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown143(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = 
arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.addf %13, %14 : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown147(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown131(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown146(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown127(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = 
gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x64x56x56xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown145(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown112(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { + %c401408 = arith.constant 401408 : index + %cst = arith.constant 0.000000e+00 : f16 + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi 
%c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xf16> } gpu.return } - gpu.func @Unknown144(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown108(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + %c401408 = arith.constant 401408 : index + %cst = arith.constant 0.000000e+00 : f16 + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c401408 step %6 
{ + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x128x28x28xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x128x28x28xf16> } gpu.return } - gpu.func @Unknown143(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown93(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xf16> } gpu.return } - gpu.func @Unknown142(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index + gpu.func 
@Unknown89(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x256x14x14xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x256x14x14xf16> } gpu.return } - gpu.func @Unknown141(%arg0: memref, %arg1: memref) kernel { - %cst = arith.constant 4.000000e+00 : f32 - %c1 = arith.constant 1 : index + gpu.func @Unknown78(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>, %arg3: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 0.000000e+00 : f16 + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1 : index - scf.if %5 { - %6 = memref.load %arg0[] : memref - %7 = arith.negf %6 : f32 - %8 = arith.divf %7, %cst : f32 - memref.store %8, %arg1[] : memref + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 
: index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown138(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xf16>) kernel { + gpu.func @Unknown74(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c3211264 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x112x112xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x112x112xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown137(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown70(%arg0: 
memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>, %arg2: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 4.900000e+01 : f16 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11] : memref<4x512xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %15 = arith.divf %13, %cst : f16 + %16 = arith.select %14, %15, %cst_0 : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown133(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown69(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf16>, %arg5: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : 
index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg6 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg6, %c1000 : index + %8 = arith.divsi %arg6, %c1000 : index + %9 = memref.load %arg2[%8] : memref<4xf16> + %10 = memref.load %arg0[%8] : memref<4xf16> + %11 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %12 = memref.load %arg3[%8, %7] : memref<4x1000xf16> + %13 = arith.subf %11, %10 : f16 + %14 = math.exp %13 : f16 + %15 = arith.mulf %14, %9 : f16 + %16 = arith.subf %12, %15 : f16 + memref.store %13, %arg4[%8, %7] : memref<4x1000xf16> + memref.store %16, %arg5[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown129(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown68(%arg0: memref<4xf16>, %arg1: memref<4xf16>) kernel { + %c4 = arith.constant 4 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi 
%32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c4 step %6 { + %7 = memref.load %arg0[%arg2] : memref<4xf16> + %8 = math.log %7 : f16 + memref.store %8, %arg1[%arg2] : memref<4xf16> } gpu.return } - gpu.func @Unknown125(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown66(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg3, %c1000 : index + %8 = arith.divsi %arg3, %c1000 : index + %9 = memref.load %arg0[%8] : memref<4xf16> + %10 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %11 = arith.subf %10, %9 : f16 + memref.store %11, %arg2[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown121(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index 
- %c64 = arith.constant 64 : index + gpu.func @Unknown64(%arg0: memref<1000xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg3, %c1000 : index + %8 = arith.divsi %arg3, %c1000 : index + %9 = memref.load %arg0[%7] : memref<1000xf16> + %10 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %11 = arith.addf %10, %9 : f16 + memref.store %11, %arg2[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown114(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown63(%arg0: memref<4x512xf16>, %arg1: memref<4x512xf16>) kernel { + %c2048 = arith.constant 2048 : index + %cst = arith.constant 2.040100e-02 : f16 + %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 
: index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2048 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<4x512xf16> + %10 = arith.mulf %9, %cst : f16 + memref.store %10, %arg1[%8, %7] : memref<4x512xf16> } gpu.return } - gpu.func @Unknown110(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %39 = arith.addf %37, %38 : f16 - %40 = 
arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x512x7x7xi1> } gpu.return } - gpu.func @Unknown106(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : 
memref<4x512x7x7xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xi1> } gpu.return } - gpu.func @Unknown102(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x256x14x14xi1> } gpu.return } - gpu.func @Unknown95(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index + gpu.func @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { 
%c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + %cst = arith.constant 0.000000e+00 : f16 %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xi1> } gpu.return } - gpu.func @Unknown91(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { + %c401408 = arith.constant 401408 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = 
arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x128x28x28xi1> } gpu.return } - gpu.func @Unknown87(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { + %c401408 = arith.constant 401408 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : 
index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xi1> } gpu.return } - gpu.func @Unknown83(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : 
memref<4x256x14x14xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x64x56x56xi1> } gpu.return } - gpu.func @Unknown76(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { + gpu.func @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown72(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>, %arg3: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index 
- %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown68(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.select %36, %37, %cst 
: f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xi1> } gpu.return } - gpu.func @Unknown64(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>, %arg2: memref<4x512x7x7xf16>) kernel { + gpu.func @Unknown26(%arg0: memref<4x64x112x112xf16>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xi1>) kernel { + %c3211264 = arith.constant 3211264 : index %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg0[%35, %29] : memref<4x512xf16> - %38 = arith.divf %37, %cst_0 : f16 - %39 = arith.select %36, %38, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c3211264 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x112x112xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x64x112x112xf16> + memref.store %15, %arg2[%12, %11, %9, 
%7] : memref<4x64x112x112xi1> } gpu.return } - gpu.func @Unknown63(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf32>, %arg5: memref<4x1000xf16>, %arg6: memref<4x1000xf32>, %arg7: memref<4x1000xf32>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index + gpu.func @Unknown24(%arg0: memref<1000xf32>, %arg1: memref<1000xf16>) kernel { %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg3[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %18 = memref.load %arg0[%15] : memref<4xf16> - %19 = memref.load %arg2[%15] : memref<4xf16> - %20 = memref.load %arg4[%15, %9] : memref<4x1000xf32> - %21 = math.log %18 : f16 - %22 = arith.subf %17, %21 : f16 - %23 = math.exp %22 : f16 - %24 = arith.mulf %23, %19 : f16 - %25 = arith.subf %16, %24 : f16 - %26 = arith.extf %22 : f16 to f32 - %27 = arith.mulf %26, %20 : f32 - %28 = arith.extf %25 : f16 to f32 - memref.store %25, %arg5[%15, %9] : memref<4x1000xf16> - memref.store %27, %arg6[%15, %9] : memref<4x1000xf32> - memref.store %28, %arg7[%15, %9] : memref<4x1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg2] : memref<1000xf32> + %8 = arith.truncf %7 : f32 to f16 + memref.store %8, %arg1[%arg2] : memref<1000xf16> } gpu.return } - gpu.func @Unknown62(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>, %arg3: memref<4x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown23(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { + %c512000 = arith.constant 512000 : index + %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg0[%15] : memref<4xf16> - %18 = arith.subf %16, %17 : f16 - %19 = math.exp %18 : f16 - memref.store %18, %arg2[%15, %9] : memref<4x1000xf16> - memref.store %19, %arg3[%15, %9] : memref<4x1000xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf32> + %10 
= arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf16> } gpu.return } - gpu.func @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown22(%arg0: memref<4x1000xf32>, %arg1: memref<4x1000xf16>) kernel { %c4000 = arith.constant 4000 : index + %cst = arith.constant -2.500000e-01 : f32 %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg0[%9] : memref<1000xf32> - %18 = arith.truncf %17 : f32 to f16 - %19 = arith.addf %16, %18 : f16 - memref.store %19, %arg2[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown60(%arg0: memref<4x512xf16>, %arg1: memref<4x512xf16>) kernel { - %cst = arith.constant 2.040100e-02 : f16 - %c0 = arith.constant 0 : index - %c2048 = arith.constant 2048 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2048 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<4x512xf16> - %17 = arith.mulf %16, %cst : f16 - memref.store %17, %arg1[%15, %9] : memref<4x512xf16> - } - gpu.return - } - gpu.func @Unknown59(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : 
index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = 
arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg2, %c1000 : index + %8 = arith.divsi %arg2, %c1000 : index + %9 = memref.load %arg0[%8, %7] : memref<4x1000xf32> + %10 = arith.mulf %9, %cst : f32 + %11 = arith.truncf %10 : f32 to f16 + memref.store %11, %arg1[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown53(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown19(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { + %c2359296 = arith.constant 2359296 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi 
slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf16> } gpu.return } - gpu.func @Unknown50(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown18(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 
= arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf16> } gpu.return } - gpu.func @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 + gpu.func @Unknown17(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { + %c131072 = arith.constant 131072 : index %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> } gpu.return } - gpu.func @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown14(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { + %c589824 = arith.constant 589824 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = 
arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf16> } gpu.return } - gpu.func @Unknown44(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + gpu.func @Unknown13(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { + %c294912 = arith.constant 294912 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi 
%c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf16> } gpu.return } - gpu.func @Unknown41(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 + gpu.func @Unknown12(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { + %c32768 = arith.constant 32768 : index %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xi1> + %5 = 
gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> } gpu.return } - gpu.func @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown9(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { + %c147456 = arith.constant 147456 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf16> } gpu.return } - gpu.func @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown8(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { + %c73728 = 
arith.constant 73728 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf16> } gpu.return } - gpu.func @Unknown35(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 + gpu.func @Unknown7(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { + %c8192 = arith.constant 8192 : index %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, 
%c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> } gpu.return } - gpu.func @Unknown32(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown3(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { + %c36864 = arith.constant 36864 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : 
memref<4x64x56x56xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf16> } gpu.return } - gpu.func @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { + %c9408 = arith.constant 9408 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf16> } gpu.return } - gpu.func @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 
802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown0(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x3x224x224xf16>) kernel { + %c602112 = arith.constant 602112 : index + %c3 = arith.constant 3 : index + %c224 = arith.constant 224 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c602112 step %6 { + %7 = arith.remsi %arg2, %c224 : index + %8 = arith.divsi %arg2, %c224 : index + %9 = arith.remsi %8, %c224 : index + %10 = arith.divsi %8, %c224 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x3x224x224xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x3x224x224xf16> } gpu.return } - gpu.func @Unknown26(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 + gpu.func @Unknown25_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = gpu.thread_id x + %4 = gpu.thread_id y + %5 = gpu.thread_id z + %6 = gpu.grid_dim x + %7 = gpu.grid_dim y + %8 = gpu.grid_dim z + %9 = gpu.block_dim x + %10 = gpu.block_dim y + %11 = gpu.block_dim z + cf.br ^bb1 + ^bb1: // pred: ^bb0 + %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index %c-1 = arith.constant -1 : index + %c512 = arith.constant 512 : index + %c-1024 = arith.constant -1024 : index + %c1000 = arith.constant 1000 : index + %cst = arith.constant 
0.000000e+00 : f16
+      %c1 = arith.constant 1 : index
+      %c256 = arith.constant 256 : index
+      %c128 = arith.constant 128 : index
       %c64 = arith.constant 64 : index
+      %c32 = arith.constant 32 : index
+      %c16 = arith.constant 16 : index
+      %c8 = arith.constant 8 : index
+      %c4 = arith.constant 4 : index
+      %12 = gpu.block_id x
+      %subview = memref.subview %arg0[%12, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>>
+      %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>>
+      %alloca = memref.alloca() : memref<512xf16, #gpu.address_space<workgroup>>
+      %13 = gpu.thread_id x
+      %14 = arith.muli %13, %c2 : index
+      %15 = arith.cmpi slt, %13, %c0 : index
+      %16 = arith.subi %c-1, %13 : index
+      %17 = arith.select %15, %16, %13 : index
+      %18 = arith.divsi %17, %c512 : index
+      %19 = arith.subi %c-1, %18 : index
+      %20 = arith.select %15, %19, %18 : index
+      %21 = arith.muli %20, %c-1024 : index
+      %22 = arith.addi %14, %21 : index
+      %23 = arith.cmpi slt, %22, %c1000 : index
+      %24 = arith.select %23, %22, %c1000 : index
+      %25 = arith.addi %22, %c2 : index
+      %26 = arith.cmpi slt, %25, %c1000 : index
+      %27 = arith.select %26, %25, %c1000 : index
+      %28 = arith.subi %27, %24 : index
+      %subview_0 = memref.subview %expand_shape[0, %24] [1, %28] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref<?xf16, strided<[1], offset: ?>>
+      %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref<?xf16, strided<[1], offset: ?>> into memref<1x?xf16, strided<[?, 1], offset: ?>>
+      %29 = arith.cmpi ugt, %28, %c0 : index
+      %30 = scf.if %29 -> (f16) {
+        %44 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>>
+        scf.yield %44 : f16
+      } else {
+        scf.yield %cst : f16
+      }
+      %31 = arith.addf %30, %cst : f16
+      %32 = arith.cmpi ugt, %28, %c1 : index
+      %33 = scf.if %32 -> (f16) {
+        %44 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>>
+        scf.yield %44 : f16
+      } else {
+        scf.yield %cst : f16
+      }
+      %34 = arith.addf %31, %33 : f16
+      memref.store %34, %alloca[%13] : memref<512xf16, #gpu.address_space<workgroup>>
+      gpu.barrier
+      %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space<workgroup>>
+      %35 = arith.cmpi ult, %13, %c256 : index
+      scf.if %35 {
+        %44 = memref.load %alloca[%14] : memref<512xf16, #gpu.address_space<workgroup>>
+        %45 = arith.addf %44, %cst : f16
+        %46 = arith.addi %14, %c1 : index
+        %47 = memref.load %alloca[%46] : memref<512xf16, #gpu.address_space<workgroup>>
+        %48 = arith.addf %47, %45 : f16
+        memref.store %48, %alloca_2[%13] : memref<256xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space<workgroup>>
+      %36 = arith.cmpi ult, %13, %c128 : index
+      scf.if %36 {
+        %44 = memref.load %alloca_2[%14] : memref<256xf16, #gpu.address_space<workgroup>>
+        %45 = arith.addf %44, %cst : f16
+        %46 = arith.addi %14, %c1 : index
+        %47 = memref.load %alloca_2[%46] : memref<256xf16, #gpu.address_space<workgroup>>
+        %48 = arith.addf %47, %45 : f16
+        memref.store %48, %alloca_3[%13] : memref<128xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space<workgroup>>
+      %37 = arith.cmpi ult, %13, %c64 : index
+      scf.if %37 {
+        %44 = memref.load %alloca_3[%14] : memref<128xf16, #gpu.address_space<workgroup>>
+        %45 = arith.addf %44, %cst : f16
+        %46 = arith.addi %14, %c1 : index
+        %47 = memref.load %alloca_3[%46] : memref<128xf16, #gpu.address_space<workgroup>>
+        %48 = arith.addf %47, %45 : f16
+        memref.store %48, %alloca_4[%13] : memref<64xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space<workgroup>>
+      %38 = arith.cmpi ult, %13, %c32 : index
+      scf.if %38 {
+        %44 = memref.load %alloca_4[%14] : memref<64xf16, #gpu.address_space<workgroup>>
+        %45 = arith.addf %44, %cst : f16
+        %46 = arith.addi %14, %c1 : index
+        %47 = memref.load %alloca_4[%46] : memref<64xf16, #gpu.address_space<workgroup>>
+        %48 = arith.addf %47, %45 : f16
+        memref.store %48, %alloca_5[%13] : memref<32xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space<workgroup>>
+      %39 = arith.cmpi ult, %13, %c16 : index
+      scf.if %39 {
+        %44 = memref.load %alloca_5[%14] : memref<32xf16, #gpu.address_space<workgroup>>
+        %45 = arith.addf %44, %cst : f16
+        %46 = arith.addi %14, %c1 : index
+        %47 = memref.load %alloca_5[%46] : memref<32xf16, #gpu.address_space<workgroup>>
+        %48 = arith.addf %47, %45 : f16
+        memref.store %48, %alloca_6[%13] : memref<16xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space<workgroup>>
+      %40 = arith.cmpi ult, %13, %c8 : index
+      scf.if %40 {
+        %44 = memref.load %alloca_6[%14] : memref<16xf16, #gpu.address_space<workgroup>>
+        %45 = arith.addf %44, %cst : f16
+        %46 = arith.addi %14, %c1 : index
+        %47 = memref.load %alloca_6[%46] : memref<16xf16, #gpu.address_space<workgroup>>
+        %48 = arith.addf %47, %45 : f16
+        memref.store %48, %alloca_7[%13] : memref<8xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space<workgroup>>
+      %41 = arith.cmpi ult, %13, %c4 : index
+      scf.if %41 {
+        %44 = memref.load %alloca_7[%14] : memref<8xf16, #gpu.address_space<workgroup>>
+        %45 = arith.addf %44, %cst : f16
+        %46 = arith.addi %14, %c1 : index
+        %47 = memref.load %alloca_7[%46] : memref<8xf16, #gpu.address_space<workgroup>>
+        %48 = arith.addf %47, %45 : f16
+ memref.store %48, %alloca_8[%13] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %42 = arith.cmpi ult, %13, %c2 : index + scf.if %42 { + %44 = memref.load %alloca_8[%14] : memref<4xf16, #gpu.address_space> + %45 = arith.addf %44, %cst : f16 + %46 = arith.addi %14, %c1 : index + %47 = memref.load %alloca_8[%46] : memref<4xf16, #gpu.address_space> + %48 = arith.addf %47, %45 : f16 + memref.store %48, %alloca_9[%13] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %43 = arith.cmpi ult, %13, %c1 : index + scf.if %43 { + %44 = memref.load %alloca_9[%14] : memref<2xf16, #gpu.address_space> + %45 = arith.addf %44, %cst : f16 + %46 = arith.addi %14, %c1 : index + %47 = memref.load %alloca_9[%46] : memref<2xf16, #gpu.address_space> + %48 = arith.addf %47, %45 : f16 + memref.store %48, %arg1[%12] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown62_kernel(%arg0: memref<2048x49xf16>, %arg1: memref<2048xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = gpu.thread_id x + %4 = gpu.thread_id y + %5 = gpu.thread_id z + %6 = gpu.grid_dim x + %7 = gpu.grid_dim y + %8 = gpu.grid_dim z + %9 = gpu.block_dim x + %10 = gpu.block_dim y + %11 = gpu.block_dim z + cf.br ^bb1 + ^bb1: // pred: ^bb0 + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %c49 = arith.constant 49 : index + %c1 = arith.constant 1 : index %cst = arith.constant 0.000000e+00 : f16 + %c32 = arith.constant 32 : index + %c2 = arith.constant 2 : index + %c16 = arith.constant 16 : index + %c8 = arith.constant 8 : index + %c4 = arith.constant 4 : index + %12 = gpu.block_id x + %subview = memref.subview %arg0[%12, 0] [1, 49] [1, 1] : memref<2048x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + %13 = gpu.thread_id x + %14 = arith.remsi %13, %c64 : index + %15 = arith.cmpi slt, %14, %c0 : index + %16 = arith.addi %14, %c64 : index + %17 = arith.select %15, %16, %14 : index + %18 = arith.cmpi slt, %17, %c49 : index + %19 = arith.select %18, %17, %c49 : index + %20 = arith.addi %17, %c1 : index + %21 = arith.cmpi slt, %20, %c49 : index + %22 = arith.select %21, %20, %c49 : index + %23 = arith.subi %22, %19 : index + %subview_0 = memref.subview %expand_shape[0, %19] [1, %23] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %24 = arith.cmpi ugt, %23, %c0 : index + %25 = scf.if %24 -> (f16) { + %33 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %33 : f16 + } else { + scf.yield %cst : f16 + } + %26 = arith.addf %25, %cst : f16 + memref.store %26, %alloca[%13] : memref<64xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<32xf16, #gpu.address_space> + %27 = arith.cmpi ult, %13, %c32 : index + scf.if %27 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca[%33] : memref<64xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca[%36] : memref<64xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, 
%alloca_2[%13] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<16xf16, #gpu.address_space> + %28 = arith.cmpi ult, %13, %c16 : index + scf.if %28 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_2[%33] : memref<32xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_2[%36] : memref<32xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_3[%13] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<8xf16, #gpu.address_space> + %29 = arith.cmpi ult, %13, %c8 : index + scf.if %29 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_3[%33] : memref<16xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_3[%36] : memref<16xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_4[%13] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<4xf16, #gpu.address_space> + %30 = arith.cmpi ult, %13, %c4 : index + scf.if %30 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_4[%33] : memref<8xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_4[%36] : memref<8xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_5[%13] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<2xf16, #gpu.address_space> + %31 = arith.cmpi ult, %13, %c2 : index + scf.if %31 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_5[%33] : memref<4xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_5[%36] : memref<4xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_6[%13] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %32 = arith.cmpi ult, %13, %c1 : index + scf.if %32 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_6[%33] : memref<2xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_6[%36] : memref<2xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %arg1[%12] : memref<2048xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown65_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = gpu.thread_id x + %4 = gpu.thread_id y + %5 = gpu.thread_id z + %6 = gpu.grid_dim x + %7 = gpu.grid_dim y + %8 = gpu.grid_dim z + %9 = gpu.block_dim x + %10 = gpu.block_dim y + %11 = gpu.block_dim z + cf.br ^bb1 + ^bb1: // pred: ^bb0 + %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c112 = arith.constant 112 : index %c-1 = arith.constant -1 : index + %c512 = arith.constant 512 : index + %c-1024 = arith.constant -1024 : index + %c1000 = arith.constant 1000 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index 
- %5 = arith.cmpi slt, %4, %c3211264 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x112x112xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x112x112xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x112x112xi1> - } - gpu.return - } - gpu.func @Unknown23(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { + %c32 = arith.constant 32 : index + %c16 = arith.constant 16 : index + %c8 = arith.constant 8 : index + %c4 = arith.constant 4 : index + %12 = gpu.block_id x + %subview = memref.subview %arg0[%12, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %13 = gpu.thread_id x + %14 = arith.muli %13, %c2 : index + %15 = arith.cmpi slt, %13, %c0 : index + %16 = arith.subi %c-1, %13 : index + %17 = arith.select %15, %16, %13 : index + %18 = arith.divsi %17, %c512 : index + %19 = arith.subi %c-1, %18 : index + %20 = arith.select %15, %19, %18 : index + %21 = arith.muli %20, %c-1024 : index + %22 = arith.addi %14, %21 : index + %23 = arith.cmpi slt, %22, %c1000 : index + %24 = arith.select %23, %22, %c1000 : index + %25 = arith.addi %22, %c2 : index + %26 = arith.cmpi slt, %25, %c1000 : index + %27 = arith.select %26, %25, %c1000 : index + %28 = arith.subi %27, %24 : index + %subview_0 = memref.subview %expand_shape[0, %24] [1, %28] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %29 = arith.cmpi ugt, %28, %c0 : index + %30 = scf.if %29 -> (f16) { + %43 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %43 : f16 + } else { + scf.yield %cst : f16 + } + %31 = arith.cmpi ugt, %28, %c1 : index + %32 = scf.if %31 -> (f16) { + %43 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %43 : f16 + } else { + scf.yield %cst : f16 + } + %33 = arith.maximumf %30, %32 : f16 + memref.store %33, %alloca[%13] : 
memref<512xf16, #gpu.address_space<workgroup>>
+      gpu.barrier
+      %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space<workgroup>>
+      %34 = arith.cmpi ult, %13, %c256 : index
+      scf.if %34 {
+        %43 = memref.load %alloca[%14] : memref<512xf16, #gpu.address_space<workgroup>>
+        %44 = arith.addi %14, %c1 : index
+        %45 = memref.load %alloca[%44] : memref<512xf16, #gpu.address_space<workgroup>>
+        %46 = arith.maximumf %45, %43 : f16
+        memref.store %46, %alloca_2[%13] : memref<256xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space<workgroup>>
+      %35 = arith.cmpi ult, %13, %c128 : index
+      scf.if %35 {
+        %43 = memref.load %alloca_2[%14] : memref<256xf16, #gpu.address_space<workgroup>>
+        %44 = arith.addi %14, %c1 : index
+        %45 = memref.load %alloca_2[%44] : memref<256xf16, #gpu.address_space<workgroup>>
+        %46 = arith.maximumf %45, %43 : f16
+        memref.store %46, %alloca_3[%13] : memref<128xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space<workgroup>>
+      %36 = arith.cmpi ult, %13, %c64 : index
+      scf.if %36 {
+        %43 = memref.load %alloca_3[%14] : memref<128xf16, #gpu.address_space<workgroup>>
+        %44 = arith.addi %14, %c1 : index
+        %45 = memref.load %alloca_3[%44] : memref<128xf16, #gpu.address_space<workgroup>>
+        %46 = arith.maximumf %45, %43 : f16
+        memref.store %46, %alloca_4[%13] : memref<64xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space<workgroup>>
+      %37 = arith.cmpi ult, %13, %c32 : index
+      scf.if %37 {
+        %43 = memref.load %alloca_4[%14] : memref<64xf16, #gpu.address_space<workgroup>>
+        %44 = arith.addi %14, %c1 : index
+        %45 = memref.load %alloca_4[%44] : memref<64xf16, #gpu.address_space<workgroup>>
+        %46 = arith.maximumf %45, %43 : f16
+        memref.store %46, %alloca_5[%13] : memref<32xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space<workgroup>>
+      %38 = arith.cmpi ult, %13, %c16 : index
+      scf.if %38 {
+        %43 = memref.load %alloca_5[%14] : memref<32xf16, #gpu.address_space<workgroup>>
+        %44 = arith.addi %14, %c1 : index
+        %45 = memref.load %alloca_5[%44] : memref<32xf16, #gpu.address_space<workgroup>>
+        %46 = arith.maximumf %45, %43 : f16
+        memref.store %46, %alloca_6[%13] : memref<16xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space<workgroup>>
+      %39 = arith.cmpi ult, %13, %c8 : index
+      scf.if %39 {
+        %43 = memref.load %alloca_6[%14] : memref<16xf16, #gpu.address_space<workgroup>>
+        %44 = arith.addi %14, %c1 : index
+        %45 = memref.load %alloca_6[%44] : memref<16xf16, #gpu.address_space<workgroup>>
+        %46 = arith.maximumf %45, %43 : f16
+        memref.store %46, %alloca_7[%13] : memref<8xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space<workgroup>>
+      %40 = arith.cmpi ult, %13, %c4 : index
+      scf.if %40 {
+        %43 = memref.load %alloca_7[%14] : memref<8xf16, #gpu.address_space<workgroup>>
+        %44 = arith.addi %14, %c1 : index
+        %45 = memref.load %alloca_7[%44] : memref<8xf16, #gpu.address_space<workgroup>>
+        %46 = arith.maximumf %45, %43 : f16
+        memref.store %46, %alloca_8[%13] : memref<4xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space<workgroup>>
+      %41 = arith.cmpi ult, %13, %c2 : index
+      scf.if %41 {
+        %43 = memref.load %alloca_8[%14] : memref<4xf16, #gpu.address_space<workgroup>>
+        %44 = arith.addi %14, %c1 : index
+        %45 = memref.load %alloca_8[%44] : memref<4xf16, #gpu.address_space<workgroup>>
+        %46 = arith.maximumf %45, %43 : f16
+        memref.store %46, %alloca_9[%13] : memref<2xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %42 = arith.cmpi ult, %13, %c1 : index
+      scf.if %42
{ + %43 = memref.load %alloca_9[%14] : memref<2xf16, #gpu.address_space> + %44 = arith.addi %14, %c1 : index + %45 = memref.load %alloca_9[%44] : memref<2xf16, #gpu.address_space> + %46 = arith.maximumf %45, %43 : f16 + memref.store %46, %arg1[%12] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown67_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = gpu.thread_id x + %4 = gpu.thread_id y + %5 = gpu.thread_id z + %6 = gpu.grid_dim x + %7 = gpu.grid_dim y + %8 = gpu.grid_dim z + %9 = gpu.block_dim x + %10 = gpu.block_dim y + %11 = gpu.block_dim z + cf.br ^bb1 + ^bb1: // pred: ^bb0 + %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index - %c512000 = arith.constant 512000 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf16> - } - gpu.return - } - gpu.func @Unknown22(%arg0: memref<4x1000xf32>, %arg1: memref<4x1000xf16>) kernel { - %cst = arith.constant -2.500000e-01 : f32 - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<4x1000xf32> - %17 = arith.mulf %16, %cst : f32 - %18 = arith.truncf %17 : f32 to f16 - memref.store %18, %arg1[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown21(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi 
%c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown20(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown19(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 
: index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown18(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown17(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { - %c0 = arith.constant 0 : index - %c131072 = arith.constant 131072 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : 
index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - } - gpu.return - } - gpu.func @Unknown16(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown15(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi 
%c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown14(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c-1024 = arith.constant -1024 : index + %c1000 = arith.constant 1000 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown13(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - 
%17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { - %c0 = arith.constant 0 : index - %c32768 = arith.constant 32768 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - } - gpu.return - } - gpu.func @Unknown11(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = 
arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown10(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown9(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, 
%c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown8(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown7(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { - %c0 = arith.constant 0 : index - %c8192 = arith.constant 8192 : index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - } - gpu.return - } - gpu.func @Unknown6(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = 
gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown5(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : 
index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown3(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { + %c32 = arith.constant 32 : index + %c16 = arith.constant 16 : index + %c8 = arith.constant 8 : index + %c4 = arith.constant 4 : index + %12 = gpu.block_id x + %subview = memref.subview %arg0[%12, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %13 = gpu.thread_id x + %14 = arith.muli %13, %c2 : index + %15 = arith.cmpi slt, %13, %c0 : index + %16 = arith.subi %c-1, %13 : index + %17 = arith.select %15, %16, %13 : index + %18 = arith.divsi %17, %c512 : index + %19 = arith.subi %c-1, %18 : index + %20 = arith.select %15, %19, %18 : index + %21 = arith.muli %20, %c-1024 : index + %22 = arith.addi %14, %21 : index + %23 = arith.cmpi slt, %22, %c1000 : index + %24 = arith.select %23, %22, %c1000 : index + %25 = arith.addi %22, %c2 : index + %26 = arith.cmpi slt, %25, %c1000 : index + %27 = arith.select %26, %25, %c1000 : index + %28 = arith.subi %27, %24 : index + %subview_0 = memref.subview %expand_shape[0, %24] [1, %28] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %29 = arith.cmpi ugt, %28, %c0 : index + %30 = scf.if %29 -> (f16) { + %46 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %46 : f16 + } else { + scf.yield %cst : f16 + } + %31 = math.exp %30 : f16 + %32 = arith.addf %31, %cst : f16 + %33 = arith.cmpi ugt, %28, %c1 : index + %34 = scf.if %33 -> (f16) { + %46 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], 
offset: ?>> + scf.yield %46 : f16 + } else { + scf.yield %cst : f16 + } + %35 = math.exp %34 : f16 + %36 = arith.addf %32, %35 : f16 + memref.store %36, %alloca[%13] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %37 = arith.cmpi ult, %13, %c256 : index + scf.if %37 { + %46 = memref.load %alloca[%14] : memref<512xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca[%48] : memref<512xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %alloca_2[%13] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %38 = arith.cmpi ult, %13, %c128 : index + scf.if %38 { + %46 = memref.load %alloca_2[%14] : memref<256xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca_2[%48] : memref<256xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %alloca_3[%13] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %39 = arith.cmpi ult, %13, %c64 : index + scf.if %39 { + %46 = memref.load %alloca_3[%14] : memref<128xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca_3[%48] : memref<128xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %alloca_4[%13] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %40 = arith.cmpi ult, %13, %c32 : index + scf.if %40 { + %46 = memref.load %alloca_4[%14] : memref<64xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca_4[%48] : memref<64xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %alloca_5[%13] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %41 = arith.cmpi ult, %13, %c16 : index + scf.if %41 { + %46 = memref.load %alloca_5[%14] : memref<32xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca_5[%48] : memref<32xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %alloca_6[%13] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %42 = arith.cmpi ult, %13, %c8 : index + scf.if %42 { + %46 = memref.load %alloca_6[%14] : memref<16xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca_6[%48] : memref<16xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %alloca_7[%13] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %43 = arith.cmpi ult, %13, %c4 : index + scf.if %43 { + %46 = memref.load %alloca_7[%14] : memref<8xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca_7[%48] : memref<8xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %alloca_8[%13] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %44 = arith.cmpi ult, %13, 
%c2 : index + scf.if %44 { + %46 = memref.load %alloca_8[%14] : memref<4xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca_8[%48] : memref<4xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %alloca_9[%13] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %45 = arith.cmpi ult, %13, %c1 : index + scf.if %45 { + %46 = memref.load %alloca_9[%14] : memref<2xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca_9[%48] : memref<2xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %arg1[%12] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown147_kernel(%arg0: memref<32x125xf16>, %arg1: memref<32x125xf32>, %arg2: memref<32xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = gpu.thread_id x + %4 = gpu.thread_id y + %5 = gpu.thread_id z + %6 = gpu.grid_dim x + %7 = gpu.grid_dim y + %8 = gpu.grid_dim z + %9 = gpu.block_dim x + %10 = gpu.block_dim y + %11 = gpu.block_dim z + cf.br ^bb1 + ^bb1: // pred: ^bb0 + %c128 = arith.constant 128 : index %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c125 = arith.constant 125 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %cst_0 = arith.constant 0.000000e+00 : f32 %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi 
%2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf16> - } - gpu.return - } - gpu.func @Unknown0(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x3x224x224xf16>) kernel { + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %c16 = arith.constant 16 : index + %c8 = arith.constant 8 : index + %c4 = arith.constant 4 : index + %12 = gpu.block_id x + %subview = memref.subview %arg0[%12, 0] [1, 125] [1, 1] : memref<32x125xf16> to memref<125xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<125xf16, strided<[1], offset: ?>> into memref<1x125xf16, strided<[125, 1], offset: ?>> + %subview_1 = memref.subview %arg1[%12, 0] [1, 125] [1, 1] : memref<32x125xf32> to memref<125xf32, strided<[1], offset: ?>> + %expand_shape_2 = memref.expand_shape %subview_1 [[0, 1]] : memref<125xf32, strided<[1], offset: ?>> into memref<1x125xf32, strided<[125, 1], offset: ?>> + %alloca = memref.alloca() : memref<128xf32, #gpu.address_space> + %13 = gpu.thread_id x + %14 = arith.remsi %13, %c128 : index + %15 = arith.cmpi slt, %14, %c0 : index + %16 = arith.addi %14, %c128 : index + %17 = arith.select %15, %16, %14 : index + %18 = arith.cmpi slt, %17, %c125 : index + %19 = arith.select %18, %17, %c125 : index + %20 = arith.addi %17, %c1 : index + %21 = arith.cmpi slt, %20, %c125 : index + %22 = arith.select %21, %20, %c125 : index + %23 = arith.subi %22, %19 : index + %subview_3 = memref.subview %expand_shape[0, %19] [1, %23] [1, 1] : memref<1x125xf16, strided<[125, 1], offset: ?>> to memref> + %expand_shape_4 = memref.expand_shape %subview_3 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %subview_5 = memref.subview %expand_shape_2[0, %19] [1, %23] [1, 1] : memref<1x125xf32, strided<[125, 1], offset: ?>> to memref> + %expand_shape_6 = memref.expand_shape %subview_5 [[0, 1]] : memref> into memref<1x?xf32, strided<[?, 1], offset: ?>> + %24 = arith.cmpi ugt, %23, %c0 : index + %25:2 = scf.if %24 -> (f16, f32) { + %36 = memref.load %expand_shape_4[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + %37 = memref.load %expand_shape_6[%c0, %c0] : memref<1x?xf32, strided<[?, 1], offset: ?>> + scf.yield 
%36, %37 : f16, f32 + } else { + scf.yield %cst, %cst_0 : f16, f32 + } + %26 = arith.extf %25#0 : f16 to f32 + %27 = arith.mulf %26, %25#1 : f32 + %28 = arith.addf %27, %cst_0 : f32 + memref.store %28, %alloca[%13] : memref<128xf32, #gpu.address_space> + gpu.barrier + %alloca_7 = memref.alloca() : memref<64xf32, #gpu.address_space> + %29 = arith.cmpi ult, %13, %c64 : index + scf.if %29 { + %36 = arith.muli %13, %c2 : index + %37 = memref.load %alloca[%36] : memref<128xf32, #gpu.address_space> + %38 = arith.addf %37, %cst_0 : f32 + %39 = arith.addi %36, %c1 : index + %40 = memref.load %alloca[%39] : memref<128xf32, #gpu.address_space> + %41 = arith.addf %40, %38 : f32 + memref.store %41, %alloca_7[%13] : memref<64xf32, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<32xf32, #gpu.address_space> + %30 = arith.cmpi ult, %13, %c32 : index + scf.if %30 { + %36 = arith.muli %13, %c2 : index + %37 = memref.load %alloca_7[%36] : memref<64xf32, #gpu.address_space> + %38 = arith.addf %37, %cst_0 : f32 + %39 = arith.addi %36, %c1 : index + %40 = memref.load %alloca_7[%39] : memref<64xf32, #gpu.address_space> + %41 = arith.addf %40, %38 : f32 + memref.store %41, %alloca_8[%13] : memref<32xf32, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<16xf32, #gpu.address_space> + %31 = arith.cmpi ult, %13, %c16 : index + scf.if %31 { + %36 = arith.muli %13, %c2 : index + %37 = memref.load %alloca_8[%36] : memref<32xf32, #gpu.address_space> + %38 = arith.addf %37, %cst_0 : f32 + %39 = arith.addi %36, %c1 : index + %40 = memref.load %alloca_8[%39] : memref<32xf32, #gpu.address_space> + %41 = arith.addf %40, %38 : f32 + memref.store %41, %alloca_9[%13] : memref<16xf32, #gpu.address_space> + } + gpu.barrier + %alloca_10 = memref.alloca() : memref<8xf32, #gpu.address_space> + %32 = arith.cmpi ult, %13, %c8 : index + scf.if %32 { + %36 = arith.muli %13, %c2 : index + %37 = memref.load %alloca_9[%36] : memref<16xf32, #gpu.address_space> + %38 = arith.addf %37, %cst_0 : f32 + %39 = arith.addi %36, %c1 : index + %40 = memref.load %alloca_9[%39] : memref<16xf32, #gpu.address_space> + %41 = arith.addf %40, %38 : f32 + memref.store %41, %alloca_10[%13] : memref<8xf32, #gpu.address_space> + } + gpu.barrier + %alloca_11 = memref.alloca() : memref<4xf32, #gpu.address_space> + %33 = arith.cmpi ult, %13, %c4 : index + scf.if %33 { + %36 = arith.muli %13, %c2 : index + %37 = memref.load %alloca_10[%36] : memref<8xf32, #gpu.address_space> + %38 = arith.addf %37, %cst_0 : f32 + %39 = arith.addi %36, %c1 : index + %40 = memref.load %alloca_10[%39] : memref<8xf32, #gpu.address_space> + %41 = arith.addf %40, %38 : f32 + memref.store %41, %alloca_11[%13] : memref<4xf32, #gpu.address_space> + } + gpu.barrier + %alloca_12 = memref.alloca() : memref<2xf32, #gpu.address_space> + %34 = arith.cmpi ult, %13, %c2 : index + scf.if %34 { + %36 = arith.muli %13, %c2 : index + %37 = memref.load %alloca_11[%36] : memref<4xf32, #gpu.address_space> + %38 = arith.addf %37, %cst_0 : f32 + %39 = arith.addi %36, %c1 : index + %40 = memref.load %alloca_11[%39] : memref<4xf32, #gpu.address_space> + %41 = arith.addf %40, %38 : f32 + memref.store %41, %alloca_12[%13] : memref<2xf32, #gpu.address_space> + } + gpu.barrier + %35 = arith.cmpi ult, %13, %c1 : index + scf.if %35 { + %36 = arith.muli %13, %c2 : index + %37 = memref.load %alloca_12[%36] : memref<2xf32, #gpu.address_space> + %38 = arith.addf %37, %cst_0 : f32 + %39 = arith.addi %36, %c1 : index + %40 = memref.load %alloca_12[%39] : 
memref<2xf32, #gpu.address_space<workgroup>>
+        %41 = arith.addf %40, %38 : f32
+        memref.store %41, %arg2[%12] : memref<32xf32>
+      }
+      gpu.barrier
+      gpu.return
+    }
+    gpu.func @Unknown147_kernel_0(%arg0: memref<32xf32>, %arg1: memref<f32>) kernel attributes {gpu.known_block_size = array<i32: 32, 1, 1>, gpu.known_grid_size = array<i32: 1, 1, 1>} {
+      %0 = gpu.block_id x
+      %1 = gpu.block_id y
+      %2 = gpu.block_id z
+      %3 = gpu.thread_id x
+      %4 = gpu.thread_id y
+      %5 = gpu.thread_id z
+      %6 = gpu.grid_dim x
+      %7 = gpu.grid_dim y
+      %8 = gpu.grid_dim z
+      %9 = gpu.block_dim x
+      %10 = gpu.block_dim y
+      %11 = gpu.block_dim z
+      cf.br ^bb1
+    ^bb1:  // pred: ^bb0
+      %c32 = arith.constant 32 : index
+      %cst = arith.constant 0.000000e+00 : f32
+      %c16 = arith.constant 16 : index
+      %c2 = arith.constant 2 : index
+      %c1 = arith.constant 1 : index
+      %c8 = arith.constant 8 : index
+      %c4 = arith.constant 4 : index
+      %12 = gpu.block_id x
+      %alloca = memref.alloca() : memref<32xf32, #gpu.address_space<workgroup>>
+      %13 = gpu.thread_id x
+      %14 = arith.muli %12, %c32 : index
+      %15 = arith.addi %14, %13 : index
+      %16 = memref.load %arg0[%15] : memref<32xf32>
+      %17 = arith.addf %16, %cst : f32
+      memref.store %17, %alloca[%13] : memref<32xf32, #gpu.address_space<workgroup>>
+      gpu.barrier
+      %alloca_0 = memref.alloca() : memref<16xf32, #gpu.address_space<workgroup>>
+      %18 = arith.cmpi ult, %13, %c16 : index
+      scf.if %18 {
+        %23 = arith.muli %13, %c2 : index
+        %24 = memref.load %alloca[%23] : memref<32xf32, #gpu.address_space<workgroup>>
+        %25 = arith.addf %24, %cst : f32
+        %26 = arith.addi %23, %c1 : index
+        %27 = memref.load %alloca[%26] : memref<32xf32, #gpu.address_space<workgroup>>
+        %28 = arith.addf %27, %25 : f32
+        memref.store %28, %alloca_0[%13] : memref<16xf32, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_1 = memref.alloca() : memref<8xf32, #gpu.address_space<workgroup>>
+      %19 = arith.cmpi ult, %13, %c8 : index
+      scf.if %19 {
+        %23 = arith.muli %13, %c2 : index
+        %24 = memref.load %alloca_0[%23] : memref<16xf32, #gpu.address_space<workgroup>>
+        %25 = arith.addf %24, %cst : f32
+        %26 = arith.addi %23, %c1 : index
+        %27 = memref.load %alloca_0[%26] : memref<16xf32, #gpu.address_space<workgroup>>
+        %28 = arith.addf %27, %25 : f32
+        memref.store %28, %alloca_1[%13] : memref<8xf32, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_2 = memref.alloca() : memref<4xf32, #gpu.address_space<workgroup>>
+      %20 = arith.cmpi ult, %13, %c4 : index
+      scf.if %20 {
+        %23 = arith.muli %13, %c2 : index
+        %24 = memref.load %alloca_1[%23] : memref<8xf32, #gpu.address_space<workgroup>>
+        %25 = arith.addf %24, %cst : f32
+        %26 = arith.addi %23, %c1 : index
+        %27 = memref.load %alloca_1[%26] : memref<8xf32, #gpu.address_space<workgroup>>
+        %28 = arith.addf %27, %25 : f32
+        memref.store %28, %alloca_2[%13] : memref<4xf32, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_3 = memref.alloca() : memref<2xf32, #gpu.address_space<workgroup>>
+      %21 = arith.cmpi ult, %13, %c2 : index
+      scf.if %21 {
+        %23 = arith.muli %13, %c2 : index
+        %24 = memref.load %alloca_2[%23] : memref<4xf32, #gpu.address_space<workgroup>>
+        %25 = arith.addf %24, %cst : f32
+        %26 = arith.addi %23, %c1 : index
+        %27 = memref.load %alloca_2[%26] : memref<4xf32, #gpu.address_space<workgroup>>
+        %28 = arith.addf %27, %25 : f32
+        memref.store %28, %alloca_3[%13] : memref<2xf32, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %22 = arith.cmpi ult, %13, %c1 : index
+      scf.if %22 {
+        %23 = arith.muli %13, %c2 : index
+        %24 = memref.load %alloca_3[%23] : memref<2xf32, #gpu.address_space<workgroup>>
+        %25 = arith.addf %24, %cst : f32
+        %26 = arith.addi %23, %c1 : index
+        %27 = memref.load %alloca_3[%26] : memref<2xf32, #gpu.address_space<workgroup>>
+        %28 = arith.addf %27, %25 : f32
+        memref.store %28, %arg1[] :
memref + } + gpu.barrier + gpu.return + } + gpu.func @Unknown171_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<1000xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = gpu.thread_id x + %4 = gpu.thread_id y + %5 = gpu.thread_id z + %6 = gpu.grid_dim x + %7 = gpu.grid_dim y + %8 = gpu.grid_dim z + %9 = gpu.block_dim x + %10 = gpu.block_dim y + %11 = gpu.block_dim z + cf.br ^bb1 + ^bb1: // pred: ^bb0 + %c-32 = arith.constant -32 : index + %c1000 = arith.constant 1000 : index + %c32 = arith.constant 32 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index - %c602112 = arith.constant 602112 : index - %c224 = arith.constant 224 : index - %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c602112 : index - scf.if %5 { - %6 = arith.remsi %4, %c224 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c224 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c224 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c224 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c224 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c224 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x3x224x224xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x3x224x224xf16> - } - gpu.return - } - } - func.func private @Unknown0(%arg0: memref<4x3x224x224xf32>) -> memref<4x3x224x224xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4704 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + %c2 = arith.constant 2 : index + %cst = arith.constant 0.000000e+00 : f16 + %cst_0 = arith.constant 0.000000e+00 : f32 + %12 = gpu.block_id x + %13 = arith.muli %12, %c-32 : index + %14 = arith.addi %13, %c1000 : index + %15 = arith.cmpi slt, %14, %c32 : index + %16 = arith.select %15, %14, %c32 : index + %17 = arith.muli %12, %c32 : index + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %alloca_1 = memref.alloca() : memref<2x32xf32, #gpu.address_space> + %18 = gpu.thread_id x + %19 = gpu.thread_id y + %20 = arith.cmpi slt, %16, %18 : index + %21 = arith.select %20, %16, %18 : index + %22 = arith.addi %18, %c1 : index + %23 = arith.cmpi slt, %16, %22 : index + %24 = arith.select %23, %16, %22 : index + %25 = 
arith.subi %24, %21 : index + %26 = arith.cmpi ugt, %25, %c0 : index + %27 = scf.if %26 -> (f16) { + %34 = arith.muli %19, %c2 : index + %35 = arith.addi %17, %21 : index + %36 = memref.load %arg0[%34, %35] : memref<4x1000xf16> + scf.yield %36 : f16 + } else { + scf.yield %cst : f16 + } + %28 = arith.extf %27 : f16 to f32 + %29 = arith.addf %28, %cst_0 : f32 + %30 = scf.if %26 -> (f16) { + %34 = arith.muli %19, %c2 : index + %35 = arith.addi %34, %c1 : index + %36 = arith.addi %17, %21 : index + %37 = memref.load %arg0[%35, %36] : memref<4x1000xf16> + scf.yield %37 : f16 + } else { + scf.yield %cst : f16 + } + %31 = arith.extf %30 : f16 to f32 + %32 = arith.addf %29, %31 : f32 + memref.store %32, %alloca_1[%19, %18] : memref<2x32xf32, #gpu.address_space> + gpu.barrier + %33 = arith.cmpi ult, %19, %c1 : index + scf.if %33 { + %34 = memref.load %alloca_1[%c0, %18] : memref<2x32xf32, #gpu.address_space> + %35 = arith.addf %34, %cst_0 : f32 + %36 = memref.load %alloca_1[%c1, %18] : memref<2x32xf32, #gpu.address_space> + %37 = arith.addf %36, %35 : f32 + memref.store %37, %alloca[%18] : memref<32xf32, #gpu.address_space> + } + gpu.barrier + %subview = memref.subview %alloca[0] [%16] [1] : memref<32xf32, #gpu.address_space> to memref, #gpu.address_space> + %subview_2 = memref.subview %arg1[%17] [%16] [1] : memref<1000xf32> to memref> + memref.copy %subview, %subview_2 : memref, #gpu.address_space> to memref> + gpu.return + } + } + func.func private @Unknown0(%arg0: memref<4x3x224x224xf32>) -> memref<4x3x224x224xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 588 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c588 = arith.constant 588 : index %c1 = arith.constant 1 : index - %c4704 = arith.constant 4704 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x3x224x224xf16> - gpu.launch_func @unified::@Unknown0 blocks in (%c4704, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x3x224x224xf32>, %alloc : memref<4x3x224x224xf16>) + gpu.launch_func @unified::@Unknown0 blocks in (%c588, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x3x224x224xf32>, %alloc : memref<4x3x224x224xf16>) return %alloc : memref<4x3x224x224xf16> } - func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 74 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 10 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c10 = arith.constant 10 : index %c1 = arith.constant 1 : index - %c74 = arith.constant 74 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<64x3x7x7xf16> - gpu.launch_func @unified::@Unknown1 blocks in (%c74, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x3x7x7xf32>, %alloc : memref<64x3x7x7xf16>) + gpu.launch_func @unified::@Unknown1 blocks in (%c10, %c1, %c1) threads in (%c256, 
%c1, %c1) args(%arg0 : memref<64x3x7x7xf32>, %alloc : memref<64x3x7x7xf16>) return %alloc : memref<64x3x7x7xf16> } - func.func private @Unknown3(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown3", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - gpu.launch_func @unified::@Unknown3 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - gpu.launch_func @unified::@Unknown4 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown5(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown5", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown3(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 36 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown3", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c36 = arith.constant 36 : index %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - gpu.launch_func @unified::@Unknown5 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown6", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<64x64x3x3xf16> - gpu.launch_func @unified::@Unknown6 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) + gpu.launch_func @unified::@Unknown3 blocks in (%c36, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) return %alloc 
: memref<64x64x3x3xf16> } - func.func private @Unknown7(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 64 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown7", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown7(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown7", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c8 = arith.constant 8 : index %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<128x64x1x1xf16> - gpu.launch_func @unified::@Unknown7 blocks in (%c64, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x64x1x1xf32>, %alloc : memref<128x64x1x1xf16>) + gpu.launch_func @unified::@Unknown7 blocks in (%c8, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x64x1x1xf32>, %alloc : memref<128x64x1x1xf16>) return %alloc : memref<128x64x1x1xf16> } - func.func private @Unknown8(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown8", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown8(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 72 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown8", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c72 = arith.constant 72 : index %c1 = arith.constant 1 : index - %c576 = arith.constant 576 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<128x64x3x3xf16> - gpu.launch_func @unified::@Unknown8 blocks in (%c576, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x64x3x3xf32>, %alloc : memref<128x64x3x3xf16>) + gpu.launch_func @unified::@Unknown8 blocks in (%c72, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x64x3x3xf32>, %alloc : memref<128x64x3x3xf16>) return %alloc : memref<128x64x3x3xf16> } - func.func private @Unknown9(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown9", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - gpu.launch_func @unified::@Unknown9 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf32>, %alloc : memref<128x128x3x3xf16>) - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown10(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, 
__byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown10", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown9(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 144 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown9", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c144 = arith.constant 144 : index %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - gpu.launch_func @unified::@Unknown10 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf32>, %alloc : memref<128x128x3x3xf16>) - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown11(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown11", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<128x128x3x3xf16> - gpu.launch_func @unified::@Unknown11 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf32>, %alloc : memref<128x128x3x3xf16>) + gpu.launch_func @unified::@Unknown9 blocks in (%c144, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x128x3x3xf32>, %alloc : memref<128x128x3x3xf16>) return %alloc : memref<128x128x3x3xf16> } - func.func private @Unknown12(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 256 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown12", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown12(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown12", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c32 = arith.constant 32 : index %c1 = arith.constant 1 : index %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x128x1x1xf16> - gpu.launch_func @unified::@Unknown12 blocks in (%c256, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x128x1x1xf32>, %alloc : memref<256x128x1x1xf16>) + gpu.launch_func @unified::@Unknown12 blocks in (%c32, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x128x1x1xf32>, %alloc : memref<256x128x1x1xf16>) return %alloc : memref<256x128x1x1xf16> } - func.func private @Unknown13(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown13", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", 
byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown13(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown13", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c288 = arith.constant 288 : index %c1 = arith.constant 1 : index - %c2304 = arith.constant 2304 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x128x3x3xf16> - gpu.launch_func @unified::@Unknown13 blocks in (%c2304, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x128x3x3xf32>, %alloc : memref<256x128x3x3xf16>) + gpu.launch_func @unified::@Unknown13 blocks in (%c288, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x128x3x3xf32>, %alloc : memref<256x128x3x3xf16>) return %alloc : memref<256x128x3x3xf16> } - func.func private @Unknown14(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown14", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - gpu.launch_func @unified::@Unknown14 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf32>, %alloc : memref<256x256x3x3xf16>) - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown15(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown15", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - gpu.launch_func @unified::@Unknown15 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf32>, %alloc : memref<256x256x3x3xf16>) - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown16(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown16", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown14(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown14", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c576 = arith.constant 576 : index %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x256x3x3xf16> - gpu.launch_func @unified::@Unknown16 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : 
memref<256x256x3x3xf32>, %alloc : memref<256x256x3x3xf16>) + gpu.launch_func @unified::@Unknown14 blocks in (%c576, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x256x3x3xf32>, %alloc : memref<256x256x3x3xf16>) return %alloc : memref<256x256x3x3xf16> } - func.func private @Unknown17(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1024 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown17", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + func.func private @Unknown17(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 128 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown17", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c128 = arith.constant 128 : index %c1 = arith.constant 1 : index - %c1024 = arith.constant 1024 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf16> - gpu.launch_func @unified::@Unknown17 blocks in (%c1024, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x256x1x1xf32>, %alloc : memref<512x256x1x1xf16>) + gpu.launch_func @unified::@Unknown17 blocks in (%c128, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x256x1x1xf32>, %alloc : memref<512x256x1x1xf16>) return %alloc : memref<512x256x1x1xf16> } - func.func private @Unknown18(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 9216 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown18", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown18(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown18", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c1152 = arith.constant 1152 : index %c1 = arith.constant 1 : index - %c9216 = arith.constant 9216 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x3x3xf16> - gpu.launch_func @unified::@Unknown18 blocks in (%c9216, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x256x3x3xf32>, %alloc : memref<512x256x3x3xf16>) + gpu.launch_func @unified::@Unknown18 blocks in (%c1152, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x256x3x3xf32>, %alloc : memref<512x256x3x3xf16>) return %alloc : memref<512x256x3x3xf16> } - func.func private @Unknown19(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown19", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - gpu.launch_func @unified::@Unknown19 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : 
memref<512x512x3x3xf32>, %alloc : memref<512x512x3x3xf16>) - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown20(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown20", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - gpu.launch_func @unified::@Unknown20 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf32>, %alloc : memref<512x512x3x3xf16>) - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown21(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown21", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown19(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown19", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c2304 = arith.constant 2304 : index %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x512x3x3xf16> - gpu.launch_func @unified::@Unknown21 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf32>, %alloc : memref<512x512x3x3xf16>) + gpu.launch_func @unified::@Unknown19 blocks in (%c2304, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x512x3x3xf32>, %alloc : memref<512x512x3x3xf16>) return %alloc : memref<512x512x3x3xf16> } - func.func private @Unknown22(%arg0: memref<4x1000xf32>) -> memref<4x1000xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown22", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown22(%arg0: memref<4x1000xf32>) -> memref<4x1000xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown22", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x1000xf16> - gpu.launch_func @unified::@Unknown22 blocks in (%c32, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x1000xf32>, %alloc : memref<4x1000xf16>) + gpu.launch_func @unified::@Unknown22 blocks in (%c4, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x1000xf32>, %alloc : memref<4x1000xf16>) return %alloc : memref<4x1000xf16> } - func.func private @Unknown23(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> 
attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4000 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown23(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 500 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c500 = arith.constant 500 : index %c1 = arith.constant 1 : index - %c4000 = arith.constant 4000 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1000x512xf16> - gpu.launch_func @unified::@Unknown23 blocks in (%c4000, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1000x512xf32>, %alloc : memref<1000x512xf16>) + gpu.launch_func @unified::@Unknown23 blocks in (%c500, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1000x512xf32>, %alloc : memref<1000x512xf16>) return %alloc : memref<1000x512xf16> } - func.func private @Unknown24(%arg0: memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 25088 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown24", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + func.func private @Unknown24(%arg0: memref<1000xf32>) -> memref<1000xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown24", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1000xf16> + gpu.launch_func @unified::@Unknown24 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1000xf32>, %alloc : memref<1000xf16>) + return %alloc : memref<1000xf16> + } + func.func private @Unknown25(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c4 = arith.constant 4 : index + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c-1 = arith.constant -1 : index + %c512 = arith.constant 512 : index + %c2 = arith.constant 2 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + gpu.launch_func @unified::@Unknown25_kernel blocks in (%c4, %c1, %c1) threads in (%c512, %c1, %c1) args(%arg0 : memref<4x1000xf16>, %alloc : memref<4xf16>) + return %alloc : memref<4xf16> + } + func.func private @Unknown26(%arg0: memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown26", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = 
"PTXOp", byre_force_compute_name} { + %c3136 = arith.constant 3136 : index %c1 = arith.constant 1 : index - %c25088 = arith.constant 25088 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x64x112x112xi1> %alloc_0 = memref.alloc() : memref<4x64x112x112xf16> - gpu.launch_func @unified::@Unknown24 blocks in (%c25088, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x112x112xf16>, %alloc_0 : memref<4x64x112x112xf16>, %alloc : memref<4x64x112x112xi1>) + gpu.launch_func @unified::@Unknown26 blocks in (%c3136, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x64x112x112xf16>, %alloc_0 : memref<4x64x112x112xf16>, %alloc : memref<4x64x112x112xi1>) return %alloc_0, %alloc : memref<4x64x112x112xf16>, memref<4x64x112x112xi1> } - func.func private @Unknown26(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown26", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index - %alloc = memref.alloc() : memref<4x64x56x56xi1> - %alloc_0 = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown26 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %alloc_0 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xi1>) - return %alloc_0, %alloc : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> - } - func.func private @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown28", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index - %alloc = memref.alloc() : memref<4x64x56x56xi1> - %alloc_0 = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown28 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %arg1 : memref<4x64x56x56xf16>, %alloc_0 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xi1>) - return %alloc_0, %alloc : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> - } - func.func private @Unknown30(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown30", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown28(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown28", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c784 = arith.constant 784 : index %c1 = 
arith.constant 1 : index - %c6272 = arith.constant 6272 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x64x56x56xi1> %alloc_0 = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown30 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %alloc_0 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xi1>) + gpu.launch_func @unified::@Unknown28 blocks in (%c784, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %alloc_0 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xi1>) return %alloc_0, %alloc : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> } - func.func private @Unknown32(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown32", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown30", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c784 = arith.constant 784 : index %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x64x56x56xi1> %alloc_0 = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown32 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %arg1 : memref<4x64x56x56xf16>, %alloc_0 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xi1>) + gpu.launch_func @unified::@Unknown30 blocks in (%c784, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %arg1 : memref<4x64x56x56xf16>, %alloc_0 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xi1>) return %alloc_0, %alloc : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> } - func.func private @Unknown35(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown35", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c3136 = arith.constant 3136 : index - %alloc = memref.alloc() : memref<4x128x28x28xi1> - %alloc_0 = memref.alloc() : memref<4x128x28x28xf16> - gpu.launch_func @unified::@Unknown35 blocks in (%c3136, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %alloc_0 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xi1>) - return %alloc_0, %alloc : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> - } - func.func private @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = 
[4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown37", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c3136 = arith.constant 3136 : index - %alloc = memref.alloc() : memref<4x128x28x28xi1> - %alloc_0 = memref.alloc() : memref<4x128x28x28xf16> - gpu.launch_func @unified::@Unknown37 blocks in (%c3136, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %arg1 : memref<4x128x28x28xf16>, %alloc_0 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xi1>) - return %alloc_0, %alloc : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> - } - func.func private @Unknown39(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown39", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown37(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown37", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c392 = arith.constant 392 : index %c1 = arith.constant 1 : index - %c3136 = arith.constant 3136 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x128x28x28xi1> %alloc_0 = memref.alloc() : memref<4x128x28x28xf16> - gpu.launch_func @unified::@Unknown39 blocks in (%c3136, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %alloc_0 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xi1>) + gpu.launch_func @unified::@Unknown37 blocks in (%c392, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %alloc_0 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xi1>) return %alloc_0, %alloc : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> } - func.func private @Unknown41(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown41", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown39", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c392 = arith.constant 392 : index %c1 = arith.constant 1 : index - %c3136 = arith.constant 3136 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x128x28x28xi1> %alloc_0 = memref.alloc() : memref<4x128x28x28xf16> - gpu.launch_func 
@unified::@Unknown41 blocks in (%c3136, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %arg1 : memref<4x128x28x28xf16>, %alloc_0 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xi1>) + gpu.launch_func @unified::@Unknown39 blocks in (%c392, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %arg1 : memref<4x128x28x28xf16>, %alloc_0 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xi1>) return %alloc_0, %alloc : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> } - func.func private @Unknown44(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown44", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<4x256x14x14xi1> - %alloc_0 = memref.alloc() : memref<4x256x14x14xf16> - gpu.launch_func @unified::@Unknown44 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %alloc_0 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xi1>) - return %alloc_0, %alloc : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> - } - func.func private @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<4x256x14x14xi1> - %alloc_0 = memref.alloc() : memref<4x256x14x14xf16> - gpu.launch_func @unified::@Unknown46 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %arg1 : memref<4x256x14x14xf16>, %alloc_0 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xi1>) - return %alloc_0, %alloc : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> - } - func.func private @Unknown48(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown48", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown46(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x256x14x14xi1> %alloc_0 = memref.alloc() : memref<4x256x14x14xf16> - gpu.launch_func 
@unified::@Unknown48 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %alloc_0 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xi1>) + gpu.launch_func @unified::@Unknown46 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %alloc_0 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xi1>) return %alloc_0, %alloc : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> } - func.func private @Unknown50(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown50", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown48", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x256x14x14xi1> %alloc_0 = memref.alloc() : memref<4x256x14x14xf16> - gpu.launch_func @unified::@Unknown50 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %arg1 : memref<4x256x14x14xf16>, %alloc_0 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xi1>) + gpu.launch_func @unified::@Unknown48 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %arg1 : memref<4x256x14x14xf16>, %alloc_0 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xi1>) return %alloc_0, %alloc : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> } - func.func private @Unknown53(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown53", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<4x512x7x7xi1> - %alloc_0 = memref.alloc() : memref<4x512x7x7xf16> - gpu.launch_func @unified::@Unknown53 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x512x7x7xf16>, %alloc_0 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xi1>) - return %alloc_0, %alloc : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> - } - func.func private @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown55", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = 
arith.constant 128 : index + func.func private @Unknown55(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown55", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x512x7x7xi1> %alloc_0 = memref.alloc() : memref<4x512x7x7xf16> - gpu.launch_func @unified::@Unknown55 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x512x7x7xf16>, %arg1 : memref<4x512x7x7xf16>, %alloc_0 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xi1>) + gpu.launch_func @unified::@Unknown55 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x512x7x7xf16>, %alloc_0 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xi1>) return %alloc_0, %alloc : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> } - func.func private @Unknown57(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x512x7x7xi1> %alloc_0 = memref.alloc() : memref<4x512x7x7xf16> - gpu.launch_func @unified::@Unknown57 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x512x7x7xf16>, %alloc_0 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xi1>) + gpu.launch_func @unified::@Unknown57 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x512x7x7xf16>, %arg1 : memref<4x512x7x7xf16>, %alloc_0 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xi1>) return %alloc_0, %alloc : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> } - func.func private @Unknown59(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown59", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown62(%arg0: memref<4x512x7x7xf16>) -> memref<4x512xf16> attributes {__byteir_reduction_fusion__} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c2048 = arith.constant 
2048 : index + %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<4x512x7x7xi1> - %alloc_0 = memref.alloc() : memref<4x512x7x7xf16> - gpu.launch_func @unified::@Unknown59 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x512x7x7xf16>, %arg1 : memref<4x512x7x7xf16>, %alloc_0 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xi1>) - return %alloc_0, %alloc : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> + %c49 = arith.constant 49 : index + %c64 = arith.constant 64 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %collapse_shape = memref.collapse_shape %arg0 [[0, 1], [2, 3]] : memref<4x512x7x7xf16> into memref<2048x49xf16> + %alloc = memref.alloc() : memref<2048xf16> + gpu.launch_func @unified::@Unknown62_kernel blocks in (%c2048, %c1, %c1) threads in (%c64, %c1, %c1) args(%collapse_shape : memref<2048x49xf16>, %alloc : memref<2048xf16>) + %expand_shape = memref.expand_shape %alloc [[0, 1]] : memref<2048xf16> into memref<4x512xf16> + return %expand_shape : memref<4x512xf16> } - func.func private @Unknown60(%arg0: memref<4x512xf16>) -> memref<4x512xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 16 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown60", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown63(%arg0: memref<4x512xf16>) -> memref<4x512xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown63", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index - %c16 = arith.constant 16 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x512xf16> - gpu.launch_func @unified::@Unknown60 blocks in (%c16, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x512xf16>, %alloc : memref<4x512xf16>) + gpu.launch_func @unified::@Unknown63 blocks in (%c2, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x512xf16>, %alloc : memref<4x512xf16>) return %alloc : memref<4x512xf16> } - func.func private @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown64(%arg0: memref<1000xf16>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown64", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x1000xf16> - gpu.launch_func @unified::@Unknown61 blocks in (%c32, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1000xf32>, %arg1 : 
memref<4x1000xf16>, %alloc : memref<4x1000xf16>) + gpu.launch_func @unified::@Unknown64 blocks in (%c4, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1000xf16>, %arg1 : memref<4x1000xf16>, %alloc : memref<4x1000xf16>) return %alloc : memref<4x1000xf16> } - func.func private @Unknown62(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown62", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + func.func private @Unknown65(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c4 = arith.constant 4 : index + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c-1 = arith.constant -1 : index + %c512 = arith.constant 512 : index + %c2 = arith.constant 2 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + gpu.launch_func @unified::@Unknown65_kernel blocks in (%c4, %c1, %c1) threads in (%c512, %c1, %c1) args(%arg0 : memref<4x1000xf16>, %alloc : memref<4xf16>) + return %alloc : memref<4xf16> + } + func.func private @Unknown66(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown66", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x1000xf16> - %alloc_0 = memref.alloc() : memref<4x1000xf16> - gpu.launch_func @unified::@Unknown62 blocks in (%c32, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4xf16>, %arg1 : memref<4x1000xf16>, %alloc_0 : memref<4x1000xf16>, %alloc : memref<4x1000xf16>) - return %alloc_0, %alloc : memref<4x1000xf16>, memref<4x1000xf16> + gpu.launch_func @unified::@Unknown66 blocks in (%c4, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4xf16>, %arg1 : memref<4x1000xf16>, %alloc : memref<4x1000xf16>) + return %alloc : memref<4x1000xf16> } - func.func private @Unknown63(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf32>) -> (memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown63", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32, 4 : i32, 5 : i32, 6 : i32, 7 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + func.func private @Unknown67(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 
64 : index %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c4 = arith.constant 4 : index + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c-1 = arith.constant -1 : index + %c512 = arith.constant 512 : index + %c2 = arith.constant 2 : index + %cst = arith.constant 0.000000e+00 : f16 %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %alloc = memref.alloc() : memref<4x1000xf32> - %alloc_0 = memref.alloc() : memref<4x1000xf32> - %alloc_1 = memref.alloc() : memref<4x1000xf16> - gpu.launch_func @unified::@Unknown63 blocks in (%c32, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4xf16>, %arg1 : memref<4x1000xf16>, %arg2 : memref<4xf16>, %arg3 : memref<4x1000xf16>, %arg4 : memref<4x1000xf32>, %alloc_1 : memref<4x1000xf16>, %alloc_0 : memref<4x1000xf32>, %alloc : memref<4x1000xf32>) - return %alloc_1, %alloc_0, %alloc : memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32> + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + gpu.launch_func @unified::@Unknown67_kernel blocks in (%c4, %c1, %c1) threads in (%c512, %c1, %c1) args(%arg0 : memref<4x1000xf16>, %alloc : memref<4xf16>) + return %alloc : memref<4xf16> } - func.func private @Unknown64(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [2 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown64", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown68(%arg0: memref<4xf16>) -> memref<4xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown68", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<4x512x7x7xf16> - gpu.launch_func @unified::@Unknown64 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x512xf16>, %arg1 : memref<4x512x7x7xi1>, %alloc : memref<4x512x7x7xf16>) - return %alloc : memref<4x512x7x7xf16> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<4xf16> + gpu.launch_func @unified::@Unknown68 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4xf16>, %alloc : memref<4xf16>) + return %alloc : memref<4xf16> } - func.func private @Unknown68(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown68", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown69(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown69", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32, 4 : i32, 5 : i32], byre_compute_name = 
"PTXOp", byre_force_compute_name} { + %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<4x512x7x7xf16> - gpu.launch_func @unified::@Unknown68 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x512x7x7xi1>, %arg1 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xf16>) - return %alloc : memref<4x512x7x7xf16> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<4x1000xf16> + %alloc_0 = memref.alloc() : memref<4x1000xf16> + gpu.launch_func @unified::@Unknown69 blocks in (%c4, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4xf16>, %arg1 : memref<4x1000xf16>, %arg2 : memref<4xf16>, %arg3 : memref<4x1000xf16>, %alloc_0 : memref<4x1000xf16>, %alloc : memref<4x1000xf16>) + return %alloc_0, %alloc : memref<4x1000xf16>, memref<4x1000xf16> } - func.func private @Unknown72(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown72", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown70(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [2 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown70", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> - gpu.launch_func @unified::@Unknown72 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x512x7x7xf16>, %arg1 : memref<4x512x7x7xf16>, %arg2 : memref<4x512x7x7xi1>, %alloc : memref<4x512x7x7xf16>) + gpu.launch_func @unified::@Unknown70 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x512xf16>, %arg1 : memref<4x512x7x7xi1>, %alloc : memref<4x512x7x7xf16>) return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown76(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown76", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown74(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown74", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> - gpu.launch_func @unified::@Unknown76 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) 
args(%arg0 : memref<4x512x7x7xi1>, %arg1 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xf16>) + gpu.launch_func @unified::@Unknown74 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x512x7x7xi1>, %arg1 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xf16>) return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown83(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown83", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<4x256x14x14xf16> - gpu.launch_func @unified::@Unknown83 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %arg1 : memref<4x256x14x14xf16>, %arg2 : memref<4x256x14x14xi1>, %alloc : memref<4x256x14x14xf16>) - return %alloc : memref<4x256x14x14xf16> - } - func.func private @Unknown87(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown87", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown78(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown78", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<4x256x14x14xf16> - gpu.launch_func @unified::@Unknown87 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x256x14x14xi1>, %arg1 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xf16>) - return %alloc : memref<4x256x14x14xf16> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<4x512x7x7xf16> + gpu.launch_func @unified::@Unknown78 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x512x7x7xf16>, %arg1 : memref<4x512x7x7xf16>, %arg2 : memref<4x512x7x7xi1>, %alloc : memref<4x512x7x7xf16>) + return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown91(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown91", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown89(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes 
{__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown89", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> - gpu.launch_func @unified::@Unknown91 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %arg1 : memref<4x256x14x14xf16>, %arg2 : memref<4x256x14x14xi1>, %alloc : memref<4x256x14x14xf16>) + gpu.launch_func @unified::@Unknown89 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %arg1 : memref<4x256x14x14xf16>, %arg2 : memref<4x256x14x14xi1>, %alloc : memref<4x256x14x14xf16>) return %alloc : memref<4x256x14x14xf16> } - func.func private @Unknown95(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown95", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown93(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown93", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> - gpu.launch_func @unified::@Unknown95 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x256x14x14xi1>, %arg1 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xf16>) + gpu.launch_func @unified::@Unknown93 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x256x14x14xi1>, %arg1 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xf16>) return %alloc : memref<4x256x14x14xf16> } - func.func private @Unknown102(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown102", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c3136 = arith.constant 3136 : index - %alloc = memref.alloc() : memref<4x128x28x28xf16> - gpu.launch_func @unified::@Unknown102 blocks in (%c3136, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %arg1 : memref<4x128x28x28xf16>, %arg2 : memref<4x128x28x28xi1>, %alloc : memref<4x128x28x28xf16>) - return %alloc : memref<4x128x28x28xf16> - } - func.func private @Unknown106(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes 
{__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown106", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c3136 = arith.constant 3136 : index - %alloc = memref.alloc() : memref<4x128x28x28xf16> - gpu.launch_func @unified::@Unknown106 blocks in (%c3136, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x128x28x28xi1>, %arg1 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xf16>) - return %alloc : memref<4x128x28x28xf16> - } - func.func private @Unknown110(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown110", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown108(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown108", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c392 = arith.constant 392 : index %c1 = arith.constant 1 : index - %c3136 = arith.constant 3136 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> - gpu.launch_func @unified::@Unknown110 blocks in (%c3136, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %arg1 : memref<4x128x28x28xf16>, %arg2 : memref<4x128x28x28xi1>, %alloc : memref<4x128x28x28xf16>) + gpu.launch_func @unified::@Unknown108 blocks in (%c392, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %arg1 : memref<4x128x28x28xf16>, %arg2 : memref<4x128x28x28xi1>, %alloc : memref<4x128x28x28xf16>) return %alloc : memref<4x128x28x28xf16> } - func.func private @Unknown114(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown114", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown112(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown112", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c392 = arith.constant 392 : index %c1 = arith.constant 1 : index - %c3136 = arith.constant 3136 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> - gpu.launch_func @unified::@Unknown114 blocks in (%c3136, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : 
memref<4x128x28x28xi1>, %arg1 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xf16>) + gpu.launch_func @unified::@Unknown112 blocks in (%c392, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x128x28x28xi1>, %arg1 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xf16>) return %alloc : memref<4x128x28x28xf16> } - func.func private @Unknown121(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown121", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index - %alloc = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown121 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %arg1 : memref<4x64x56x56xf16>, %arg2 : memref<4x64x56x56xi1>, %alloc : memref<4x64x56x56xf16>) - return %alloc : memref<4x64x56x56xf16> - } - func.func private @Unknown125(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown125", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index - %alloc = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown125 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xi1>, %arg1 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xf16>) - return %alloc : memref<4x64x56x56xf16> - } - func.func private @Unknown129(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown129", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown127(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown127", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c784 = arith.constant 784 : index %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown129 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %arg1 : memref<4x64x56x56xf16>, %arg2 : memref<4x64x56x56xi1>, %alloc : memref<4x64x56x56xf16>) + gpu.launch_func @unified::@Unknown127 blocks in (%c784, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : 
memref<4x64x56x56xf16>, %arg1 : memref<4x64x56x56xf16>, %arg2 : memref<4x64x56x56xi1>, %alloc : memref<4x64x56x56xf16>) return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown133(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown133", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown131(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown131", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c784 = arith.constant 784 : index %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown133 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xi1>, %arg1 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xf16>) + gpu.launch_func @unified::@Unknown131 blocks in (%c784, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x64x56x56xi1>, %arg1 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xf16>) return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown137(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown137", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown143(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown143", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c784 = arith.constant 784 : index %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown137 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %arg1 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xf16>) + gpu.launch_func @unified::@Unknown143 blocks in (%c784, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %arg1 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xf16>) return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown138(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 25088 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown138", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", 
byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown144(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown144", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c3136 = arith.constant 3136 : index %c1 = arith.constant 1 : index - %c25088 = arith.constant 25088 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x64x112x112xf16> - gpu.launch_func @unified::@Unknown138 blocks in (%c25088, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x112x112xi1>, %arg1 : memref<4x64x112x112xf16>, %alloc : memref<4x64x112x112xf16>) + gpu.launch_func @unified::@Unknown144 blocks in (%c3136, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x64x112x112xi1>, %arg1 : memref<4x64x112x112xf16>, %alloc : memref<4x64x112x112xf16>) return %alloc : memref<4x64x112x112xf16> } - func.func private @Unknown141(%arg0: memref<f32>) -> memref<f32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [0 : i32, 0 : i32], __byre__kernel_name = "Unknown141", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + func.func private @Unknown147(%arg0: memref<4x1000xf16>, %arg1: memref<4x1000xf32>) -> memref<f32> attributes {__byteir_reduction_fusion__} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c32 = arith.constant 32 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c125 = arith.constant 125 : index %c128 = arith.constant 128 : index + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<f32> + %collapse_shape = memref.collapse_shape %arg0 [[0, 1]] : memref<4x1000xf16> into memref<4000xf16> + %collapse_shape_1 = memref.collapse_shape %arg1 [[0, 1]] : memref<4x1000xf32> into memref<4000xf32> + %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<4000xf16> into memref<32x125xf16> + %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] : memref<4000xf32> into memref<32x125xf32> + %alloc_3 = memref.alloc() : memref<32xf32> + gpu.launch_func @unified::@Unknown147_kernel blocks in (%c32, %c1, %c1) threads in (%c128, %c1, %c1) args(%expand_shape : memref<32x125xf16>, %expand_shape_2 : memref<32x125xf32>, %alloc_3 : memref<32xf32>) + gpu.launch_func @unified::@Unknown147_kernel_0 blocks in (%c1, %c1, %c1) threads in (%c32, %c1, %c1) args(%alloc_3 : memref<32xf32>, %alloc : memref<f32>) + return %alloc : memref<f32> + } + func.func private @Unknown148(%arg0: memref<f32>) -> memref<f32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [0 : i32, 0 : i32], __byre__kernel_name = "Unknown148", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<f32> - gpu.launch_func @unified::@Unknown141 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<f32>, %alloc : memref<f32>) + gpu.launch_func @unified::@Unknown148 blocks in (%c1, %c1, %c1)
threads in (%c256, %c1, %c1) args(%arg0 : memref<f32>, %alloc : memref<f32>) return %alloc : memref<f32> } - func.func private @Unknown142(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 74 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown142", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown149(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 10 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown149", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c10 = arith.constant 10 : index %c1 = arith.constant 1 : index - %c74 = arith.constant 74 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<64x3x7x7xf32> - gpu.launch_func @unified::@Unknown142 blocks in (%c74, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x3x7x7xf16>, %alloc : memref<64x3x7x7xf32>) + gpu.launch_func @unified::@Unknown149 blocks in (%c10, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<64x3x7x7xf16>, %alloc : memref<64x3x7x7xf32>) return %alloc : memref<64x3x7x7xf32> } - func.func private @Unknown143(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown143", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - gpu.launch_func @unified::@Unknown143 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown144(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown144", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - gpu.launch_func @unified::@Unknown144 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown145(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown145", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - gpu.launch_func @unified::@Unknown145 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) - return %alloc : 
memref<64x64x3x3xf32> - } - func.func private @Unknown146(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown146", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown150(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 36 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown150", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c36 = arith.constant 36 : index %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<64x64x3x3xf32> - gpu.launch_func @unified::@Unknown146 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) + gpu.launch_func @unified::@Unknown150 blocks in (%c36, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) return %alloc : memref<64x64x3x3xf32> } - func.func private @Unknown147(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown147", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown154(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 72 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown154", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c72 = arith.constant 72 : index %c1 = arith.constant 1 : index - %c576 = arith.constant 576 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<128x64x3x3xf32> - gpu.launch_func @unified::@Unknown147 blocks in (%c576, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x64x3x3xf16>, %alloc : memref<128x64x3x3xf32>) + gpu.launch_func @unified::@Unknown154 blocks in (%c72, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x64x3x3xf16>, %alloc : memref<128x64x3x3xf32>) return %alloc : memref<128x64x3x3xf32> } - func.func private @Unknown148(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown148", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown155(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 144 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown155", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c144 = arith.constant 144 : index %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index + %c256 = 
arith.constant 256 : index %alloc = memref.alloc() : memref<128x128x3x3xf32> - gpu.launch_func @unified::@Unknown148 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf16>, %alloc : memref<128x128x3x3xf32>) + gpu.launch_func @unified::@Unknown155 blocks in (%c144, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x128x3x3xf16>, %alloc : memref<128x128x3x3xf32>) return %alloc : memref<128x128x3x3xf32> } - func.func private @Unknown149(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 64 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown149", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown156(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown156", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c8 = arith.constant 8 : index %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<128x64x1x1xf32> - gpu.launch_func @unified::@Unknown149 blocks in (%c64, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x64x1x1xf16>, %alloc : memref<128x64x1x1xf32>) + gpu.launch_func @unified::@Unknown156 blocks in (%c8, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x64x1x1xf16>, %alloc : memref<128x64x1x1xf32>) return %alloc : memref<128x64x1x1xf32> } - func.func private @Unknown150(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown150", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf32> - gpu.launch_func @unified::@Unknown150 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf16>, %alloc : memref<128x128x3x3xf32>) - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown151(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown151", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf32> - gpu.launch_func @unified::@Unknown151 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf16>, %alloc : memref<128x128x3x3xf32>) - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown152(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown152", __byteir_elementwise_fusion__, arg_offsets = [0 
: i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown159(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown159", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c288 = arith.constant 288 : index %c1 = arith.constant 1 : index - %c2304 = arith.constant 2304 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x128x3x3xf32> - gpu.launch_func @unified::@Unknown152 blocks in (%c2304, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x128x3x3xf16>, %alloc : memref<256x128x3x3xf32>) + gpu.launch_func @unified::@Unknown159 blocks in (%c288, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x128x3x3xf16>, %alloc : memref<256x128x3x3xf32>) return %alloc : memref<256x128x3x3xf32> } - func.func private @Unknown153(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown153", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown160(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown160", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c576 = arith.constant 576 : index %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x256x3x3xf32> - gpu.launch_func @unified::@Unknown153 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf16>, %alloc : memref<256x256x3x3xf32>) + gpu.launch_func @unified::@Unknown160 blocks in (%c576, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x256x3x3xf16>, %alloc : memref<256x256x3x3xf32>) return %alloc : memref<256x256x3x3xf32> } - func.func private @Unknown154(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 256 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown154", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown161(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown161", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c32 = arith.constant 32 : index %c1 = arith.constant 1 : index %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x128x1x1xf32> - gpu.launch_func @unified::@Unknown154 blocks in (%c256, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x128x1x1xf16>, %alloc : memref<256x128x1x1xf32>) + gpu.launch_func @unified::@Unknown161 blocks in (%c32, %c1, %c1) threads 
in (%c256, %c1, %c1) args(%arg0 : memref<256x128x1x1xf16>, %alloc : memref<256x128x1x1xf32>) return %alloc : memref<256x128x1x1xf32> } - func.func private @Unknown155(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown155", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf32> - gpu.launch_func @unified::@Unknown155 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf16>, %alloc : memref<256x256x3x3xf32>) - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown156(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown156", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf32> - gpu.launch_func @unified::@Unknown156 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf16>, %alloc : memref<256x256x3x3xf32>) - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown157(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 9216 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown157", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown164(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown164", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c1152 = arith.constant 1152 : index %c1 = arith.constant 1 : index - %c9216 = arith.constant 9216 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x3x3xf32> - gpu.launch_func @unified::@Unknown157 blocks in (%c9216, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x256x3x3xf16>, %alloc : memref<512x256x3x3xf32>) + gpu.launch_func @unified::@Unknown164 blocks in (%c1152, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x256x3x3xf16>, %alloc : memref<512x256x3x3xf32>) return %alloc : memref<512x256x3x3xf32> } - func.func private @Unknown158(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown158", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown165(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2304 : i32, 
__byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown165", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c2304 = arith.constant 2304 : index %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x512x3x3xf32> - gpu.launch_func @unified::@Unknown158 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf16>, %alloc : memref<512x512x3x3xf32>) + gpu.launch_func @unified::@Unknown165 blocks in (%c2304, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x512x3x3xf16>, %alloc : memref<512x512x3x3xf32>) return %alloc : memref<512x512x3x3xf32> } - func.func private @Unknown159(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1024 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown159", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + func.func private @Unknown166(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 128 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown166", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c128 = arith.constant 128 : index %c1 = arith.constant 1 : index - %c1024 = arith.constant 1024 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf32> - gpu.launch_func @unified::@Unknown159 blocks in (%c1024, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x256x1x1xf16>, %alloc : memref<512x256x1x1xf32>) + gpu.launch_func @unified::@Unknown166 blocks in (%c128, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x256x1x1xf16>, %alloc : memref<512x256x1x1xf32>) return %alloc : memref<512x256x1x1xf32> } - func.func private @Unknown160(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown160", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = memref.alloc() : memref<512x512x3x3xf32> - gpu.launch_func @unified::@Unknown160 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf16>, %alloc : memref<512x512x3x3xf32>) - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown161(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown161", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = memref.alloc() : memref<512x512x3x3xf32> - gpu.launch_func @unified::@Unknown161 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf16>, %alloc : memref<512x512x3x3xf32>) - return %alloc : 
memref<512x512x3x3xf32> - } - func.func private @Unknown163(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4000 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown163", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown170(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 500 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown170", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c500 = arith.constant 500 : index %c1 = arith.constant 1 : index - %c4000 = arith.constant 4000 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1000x512xf32> - gpu.launch_func @unified::@Unknown163 blocks in (%c4000, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1000x512xf16>, %alloc : memref<1000x512xf32>) + gpu.launch_func @unified::@Unknown170 blocks in (%c500, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1000x512xf16>, %alloc : memref<1000x512xf32>) return %alloc : memref<1000x512xf32> } - func.func private @Unknown164(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown164", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown171(%arg0: memref<4x1000xf16>) -> memref<1000xf32> attributes {__byteir_reduction_fusion__} { + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %c1000 = arith.constant 1000 : index + %c-32 = arith.constant -32 : index + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<1000xf32> + gpu.launch_func @unified::@Unknown171_kernel blocks in (%c32, %c1, %c1) threads in (%c32, %c2, %c1) args(%arg0 : memref<4x1000xf16>, %alloc : memref<1000xf32>) + return %alloc : memref<1000xf32> + } + func.func private @Unknown172(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown172", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1000xf32> - gpu.launch_func @unified::@Unknown164 blocks in (%c8, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1000xf32>, %alloc : memref<1000xf32>) + gpu.launch_func @unified::@Unknown172 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1000xf32>, %alloc : memref<1000xf32>) return %alloc : memref<1000xf32> } func.func @main(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x1000xf32>, %arg2: memref<64x3x7x7xf32>, %arg3: memref<64xf32>, %arg4: memref<64xf32>, %arg5: memref<64xf32>, %arg6: memref<64xf32>, %arg7: memref<64x64x3x3xf32>, %arg8: memref<64xf32>, %arg9: memref<64xf32>, %arg10: memref<64xf32>, %arg11: memref<64xf32>, %arg12: 
memref<64x64x3x3xf32>, %arg13: memref<64xf32>, %arg14: memref<64xf32>, %arg15: memref<64xf32>, %arg16: memref<64xf32>, %arg17: memref<64x64x3x3xf32>, %arg18: memref<64xf32>, %arg19: memref<64xf32>, %arg20: memref<64xf32>, %arg21: memref<64xf32>, %arg22: memref<64x64x3x3xf32>, %arg23: memref<64xf32>, %arg24: memref<64xf32>, %arg25: memref<64xf32>, %arg26: memref<64xf32>, %arg27: memref<128x64x3x3xf32>, %arg28: memref<128xf32>, %arg29: memref<128xf32>, %arg30: memref<128xf32>, %arg31: memref<128xf32>, %arg32: memref<128x128x3x3xf32>, %arg33: memref<128xf32>, %arg34: memref<128xf32>, %arg35: memref<128xf32>, %arg36: memref<128xf32>, %arg37: memref<128x64x1x1xf32>, %arg38: memref<128xf32>, %arg39: memref<128xf32>, %arg40: memref<128xf32>, %arg41: memref<128xf32>, %arg42: memref<128x128x3x3xf32>, %arg43: memref<128xf32>, %arg44: memref<128xf32>, %arg45: memref<128xf32>, %arg46: memref<128xf32>, %arg47: memref<128x128x3x3xf32>, %arg48: memref<128xf32>, %arg49: memref<128xf32>, %arg50: memref<128xf32>, %arg51: memref<128xf32>, %arg52: memref<256x128x3x3xf32>, %arg53: memref<256xf32>, %arg54: memref<256xf32>, %arg55: memref<256xf32>, %arg56: memref<256xf32>, %arg57: memref<256x256x3x3xf32>, %arg58: memref<256xf32>, %arg59: memref<256xf32>, %arg60: memref<256xf32>, %arg61: memref<256xf32>, %arg62: memref<256x128x1x1xf32>, %arg63: memref<256xf32>, %arg64: memref<256xf32>, %arg65: memref<256xf32>, %arg66: memref<256xf32>, %arg67: memref<256x256x3x3xf32>, %arg68: memref<256xf32>, %arg69: memref<256xf32>, %arg70: memref<256xf32>, %arg71: memref<256xf32>, %arg72: memref<256x256x3x3xf32>, %arg73: memref<256xf32>, %arg74: memref<256xf32>, %arg75: memref<256xf32>, %arg76: memref<256xf32>, %arg77: memref<512x256x3x3xf32>, %arg78: memref<512xf32>, %arg79: memref<512xf32>, %arg80: memref<512xf32>, %arg81: memref<512xf32>, %arg82: memref<512x512x3x3xf32>, %arg83: memref<512xf32>, %arg84: memref<512xf32>, %arg85: memref<512xf32>, %arg86: memref<512xf32>, %arg87: memref<512x256x1x1xf32>, %arg88: memref<512xf32>, %arg89: memref<512xf32>, %arg90: memref<512xf32>, %arg91: memref<512xf32>, %arg92: memref<512x512x3x3xf32>, %arg93: memref<512xf32>, %arg94: memref<512xf32>, %arg95: memref<512xf32>, %arg96: memref<512xf32>, %arg97: memref<512x512x3x3xf32>, %arg98: memref<512xf32>, %arg99: memref<512xf32>, %arg100: memref<512xf32>, %arg101: memref<512xf32>, %arg102: memref<1000x512xf32>, %arg103: memref<1000xf32>) -> (memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, 
memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32>) attributes {__placeholder__byre.entry_point} { @@ -4678,344 +2793,340 @@ module @IrToMhlo.2452 attributes {gpu.container_module} { %alloc_0 = memref.alloc() : memref<4x64x112x112xf16> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc, %arg3, %arg4, %alloc_0) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x112x112xf16> %2 = call @Unknown3(%arg7) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %3 = call @Unknown4(%arg12) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %4 = call @Unknown5(%arg17) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %5 = call @Unknown6(%arg22) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %3 = call @Unknown3(%arg12) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %4 = call @Unknown3(%arg17) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %5 = call @Unknown3(%arg22) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %6 = call @Unknown7(%arg37) : (memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> %7 = call @Unknown8(%arg27) : (memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> %8 = call @Unknown9(%arg32) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> - %9 = call @Unknown10(%arg42) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> - %10 = call @Unknown11(%arg47) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %9 = call @Unknown9(%arg42) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %10 = call @Unknown9(%arg47) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %11 = call @Unknown12(%arg62) : (memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> %12 = call @Unknown13(%arg52) : (memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> %13 = call @Unknown14(%arg57) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> - %14 = call @Unknown15(%arg67) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> - %15 = call @Unknown16(%arg72) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %14 = call @Unknown14(%arg67) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %15 = call @Unknown14(%arg72) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %16 = call @Unknown17(%arg87) : (memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> %17 = call @Unknown18(%arg77) : (memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> %18 = call @Unknown19(%arg82) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> - %19 = call @Unknown20(%arg92) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> - %20 = call @Unknown21(%arg97) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %19 = call @Unknown19(%arg92) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %20 = call @Unknown19(%arg97) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %21 = call @Unknown22(%arg1) : (memref<4x1000xf32>) -> memref<4x1000xf16> %22 = call @Unknown23(%arg102) : (memref<1000x512xf32>) -> memref<1000x512xf16> - %alloc_1 = memref.alloc() : memref<4xf16> - byre.compute @ReduceSumOp_f16_f16(%21, %alloc_1) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %23:2 = call @Unknown24(%alloc_0) : (memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) + %23 = call @Unknown24(%arg103) : (memref<1000xf32>) -> memref<1000xf16> + %24 = call @Unknown25(%21) : (memref<4x1000xf16>) -> memref<4xf16> + %25:2 = 
call @Unknown26(%alloc_0) : (memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) + %alloc_1 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @PoolMaxOp_f16_f16(%25#0, %alloc_1) {base_dilations = dense<1> : tensor<4xi64>, memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16> %alloc_2 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @PoolMaxOp_f16_f16(%23#0, %alloc_2) {base_dilations = dense<1> : tensor<4xi64>, memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16> + byre.compute @ConvOp_f16f16_f16(%alloc_1, %2, %alloc_2) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_3 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%alloc_2, %2, %alloc_3) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_2, %arg8, %arg9, %alloc_3) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %26:2 = call @Unknown28(%alloc_3) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_4 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_3, %arg8, %arg9, %alloc_4) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %24:2 = call @Unknown26(%alloc_4) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%26#0, %3, %alloc_4) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_5 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%24#0, %3, %alloc_5) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = 
dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_4, %arg13, %arg14, %alloc_5) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %27:2 = call @Unknown30(%alloc_5, %alloc_1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_6 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_5, %arg13, %arg14, %alloc_6) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %25:2 = call @Unknown28(%alloc_6, %alloc_2) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%27#0, %4, %alloc_6) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_7 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%25#0, %4, %alloc_7) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_6, %arg18, %arg19, %alloc_7) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %28:2 = call @Unknown28(%alloc_7) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_8 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_7, %arg18, %arg19, %alloc_8) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %26:2 = call @Unknown30(%alloc_8) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%28#0, %5, %alloc_8) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_9 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute 
@ConvOp_f16f16_f16(%26#0, %5, %alloc_9) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_10 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_9, %arg23, %arg24, %alloc_10) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %27:2 = call @Unknown32(%alloc_10, %25#0) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_8, %arg23, %arg24, %alloc_9) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %29:2 = call @Unknown30(%alloc_9, %27#0) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + %alloc_10 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvOp_f16f16_f16(%29#0, %6, %alloc_10) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x1x1xf16>, memref<4x128x28x28xf16> %alloc_11 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%27#0, %6, %alloc_11) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x1x1xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_10, %arg38, %arg39, %alloc_11) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> %alloc_12 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_11, %arg38, %arg39, %alloc_12) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + byre.compute @ConvOp_f16f16_f16(%29#0, %7, %alloc_12) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x3x3xf16>, memref<4x128x28x28xf16> %alloc_13 = memref.alloc() : memref<4x128x28x28xf16> 
- byre.compute @ConvOp_f16f16_f16(%27#0, %7, %alloc_13) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_12, %arg28, %arg29, %alloc_13) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %30:2 = call @Unknown37(%alloc_13) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_14 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_13, %arg28, %arg29, %alloc_14) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %28:2 = call @Unknown35(%alloc_14) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%30#0, %8, %alloc_14) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_15 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%28#0, %8, %alloc_15) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_14, %arg33, %arg34, %alloc_15) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %31:2 = call @Unknown39(%alloc_15, %alloc_11) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_16 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_15, %arg33, %arg34, %alloc_16) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %29:2 = call @Unknown37(%alloc_16, %alloc_12) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%31#0, %9, %alloc_16) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> 
: tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_17 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%29#0, %9, %alloc_17) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_16, %arg43, %arg44, %alloc_17) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %32:2 = call @Unknown37(%alloc_17) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_18 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_17, %arg43, %arg44, %alloc_18) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %30:2 = call @Unknown39(%alloc_18) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%32#0, %10, %alloc_18) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_19 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%30#0, %10, %alloc_19) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_20 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_19, %arg48, %arg49, %alloc_20) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %31:2 = call @Unknown41(%alloc_20, %29#0) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_18, %arg48, %arg49, %alloc_19) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %33:2 = call @Unknown39(%alloc_19, %31#0) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + %alloc_20 = memref.alloc() : memref<4x256x14x14xf16> + 
byre.compute @ConvOp_f16f16_f16(%33#0, %11, %alloc_20) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x1x1xf16>, memref<4x256x14x14xf16> %alloc_21 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%31#0, %11, %alloc_21) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x1x1xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_20, %arg63, %arg64, %alloc_21) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> %alloc_22 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_21, %arg63, %arg64, %alloc_22) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + byre.compute @ConvOp_f16f16_f16(%33#0, %12, %alloc_22) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x3x3xf16>, memref<4x256x14x14xf16> %alloc_23 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%31#0, %12, %alloc_23) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_22, %arg53, %arg54, %alloc_23) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %34:2 = call @Unknown46(%alloc_23) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_24 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_23, %arg53, %arg54, %alloc_24) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %32:2 = call @Unknown44(%alloc_24) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%34#0, %13, %alloc_24) 
{batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_25 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%32#0, %13, %alloc_25) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_24, %arg58, %arg59, %alloc_25) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %35:2 = call @Unknown48(%alloc_25, %alloc_21) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_26 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_25, %arg58, %arg59, %alloc_26) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %33:2 = call @Unknown46(%alloc_26, %alloc_22) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%35#0, %14, %alloc_26) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_27 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%33#0, %14, %alloc_27) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_26, %arg68, %arg69, %alloc_27) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %36:2 = call @Unknown46(%alloc_27) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_28 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_27, %arg68, %arg69, %alloc_28) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : 
memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %34:2 = call @Unknown48(%alloc_28) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%36#0, %15, %alloc_28) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_29 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%34#0, %15, %alloc_29) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_30 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_29, %arg73, %arg74, %alloc_30) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %35:2 = call @Unknown50(%alloc_30, %33#0) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_28, %arg73, %arg74, %alloc_29) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %37:2 = call @Unknown48(%alloc_29, %35#0) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + %alloc_30 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvOp_f16f16_f16(%37#0, %16, %alloc_30) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x1x1xf16>, memref<4x512x7x7xf16> %alloc_31 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%35#0, %16, %alloc_31) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x1x1xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_30, %arg88, %arg89, %alloc_31) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> %alloc_32 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute 
@BatchNormTrainingOp_f16f32f32_f16(%alloc_31, %arg88, %arg89, %alloc_32) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + byre.compute @ConvOp_f16f16_f16(%37#0, %17, %alloc_32) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x3x3xf16>, memref<4x512x7x7xf16> %alloc_33 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%35#0, %17, %alloc_33) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_32, %arg78, %arg79, %alloc_33) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %38:2 = call @Unknown55(%alloc_33) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_34 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_33, %arg78, %arg79, %alloc_34) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %36:2 = call @Unknown53(%alloc_34) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @ConvOp_f16f16_f16(%38#0, %18, %alloc_34) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_35 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%36#0, %18, %alloc_35) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_34, %arg83, %arg84, %alloc_35) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %39:2 = call @Unknown57(%alloc_35, %alloc_31) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_36 = memref.alloc() : 
memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_35, %arg83, %arg84, %alloc_36) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %37:2 = call @Unknown55(%alloc_36, %alloc_32) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @ConvOp_f16f16_f16(%39#0, %19, %alloc_36) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_37 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%37#0, %19, %alloc_37) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_36, %arg93, %arg94, %alloc_37) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %40:2 = call @Unknown55(%alloc_37) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_38 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_37, %arg93, %arg94, %alloc_38) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %38:2 = call @Unknown57(%alloc_38) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @ConvOp_f16f16_f16(%40#0, %20, %alloc_38) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_39 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%38#0, %20, %alloc_39) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_40 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_39, %arg98, %arg99, %alloc_40) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, 
memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %39:2 = call @Unknown59(%alloc_40, %37#0) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_38, %arg98, %arg99, %alloc_39) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %41:2 = call @Unknown57(%alloc_39, %39#0) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + %42 = call @Unknown62(%41#0) : (memref<4x512x7x7xf16>) -> memref<4x512xf16> + %43 = call @Unknown63(%42) : (memref<4x512xf16>) -> memref<4x512xf16> + %alloc_40 = memref.alloc() : memref<4x1000xf16> + byre.compute @MatmulOp_f16f16_f16(%43, %22, %alloc_40) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16>, memref<1000x512xf16>, memref<4x1000xf16> + %44 = call @Unknown64(%23, %alloc_40) : (memref<1000xf16>, memref<4x1000xf16>) -> memref<4x1000xf16> + %45 = call @Unknown65(%44) : (memref<4x1000xf16>) -> memref<4xf16> + %46 = call @Unknown66(%45, %44) : (memref<4xf16>, memref<4x1000xf16>) -> memref<4x1000xf16> + %47 = call @Unknown67(%46) : (memref<4x1000xf16>) -> memref<4xf16> + %48 = call @Unknown68(%47) : (memref<4xf16>) -> memref<4xf16> + %49:2 = call @Unknown69(%48, %46, %24, %21) : (memref<4xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) %alloc_41 = memref.alloc() : memref<4x512xf16> - byre.compute @ReduceSumOp_f16_f16(%39#0, %alloc_41) {dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<4x512xf16> - %40 = call @Unknown60(%alloc_41) : (memref<4x512xf16>) -> memref<4x512xf16> - %alloc_42 = memref.alloc() : memref<4x1000xf16> - byre.compute @MatmulOp_f16f16_f16(%40, %22, %alloc_42) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16>, memref<1000x512xf16>, memref<4x1000xf16> - %41 = call @Unknown61(%arg103, %alloc_42) : (memref<1000xf32>, memref<4x1000xf16>) -> memref<4x1000xf16> - %alloc_43 = memref.alloc() : memref<4xf16> - byre.compute @ReduceMaxOp_f16_f16(%41, %alloc_43) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %42:2 = call @Unknown62(%alloc_43, %41) : (memref<4xf16>, memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) - %alloc_44 = memref.alloc() : memref<4xf16> - byre.compute @ReduceSumOp_f16_f16(%42#1, %alloc_44) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %43:3 = call @Unknown63(%alloc_44, %42#0, %alloc_1, %21, %arg1) : (memref<4xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4x1000xf16>, memref<4x1000xf32>) -> (memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>) - %alloc_45 = memref.alloc() : memref<4x512xf16> - byre.compute @MatmulOp_f16f16_f16(%43#0, %22, %alloc_45) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16>, memref<1000x512xf16>, memref<4x512xf16> - %44 = call @Unknown64(%alloc_45, %39#1) : (memref<4x512xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> - %alloc_46 = memref.alloc() : memref<4x512x7x7xf16> - 
%alloc_47 = memref.alloc() : memref<512xf32> + byre.compute @MatmulOp_f16f16_f16(%49#1, %22, %alloc_41) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16>, memref<1000x512xf16>, memref<4x512xf16> + %50 = call @Unknown70(%alloc_41, %41#1) : (memref<4x512xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> + %alloc_42 = memref.alloc() : memref<4x512x7x7xf16> + %alloc_43 = memref.alloc() : memref<512xf32> + %alloc_44 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_38, %arg98, %50, %alloc_42, %alloc_43, %alloc_44) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_45 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_42, %20, %alloc_45) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_46 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%40#0, %alloc_42, %alloc_46) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %51 = call @Unknown74(%40#1, %alloc_45) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> + %alloc_47 = memref.alloc() : memref<4x512x7x7xf16> %alloc_48 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_39, %arg98, %44, %alloc_46, %alloc_47, %alloc_48) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_49 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_46, %20, %alloc_49) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_50 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_46, %alloc_50) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %45 = call @Unknown68(%38#1, %alloc_49) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> - %alloc_51 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_52 = memref.alloc() : memref<512xf32> + %alloc_49 = memref.alloc() 
: memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_36, %arg93, %51, %alloc_47, %alloc_48, %alloc_49) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_50 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_47, %19, %alloc_50) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_51 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%39#0, %alloc_47, %alloc_51) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %52 = call @Unknown78(%50, %alloc_50, %39#1) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> + %alloc_52 = memref.alloc() : memref<4x512x7x7xf16> %alloc_53 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_37, %arg93, %45, %alloc_51, %alloc_52, %alloc_53) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_54 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_51, %19, %alloc_54) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_55 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_51, %alloc_55) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %46 = call @Unknown72(%44, %alloc_54, %37#1) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> - %alloc_56 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_57 = memref.alloc() : memref<512xf32> + %alloc_54 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_34, %arg83, %52, %alloc_52, %alloc_53, %alloc_54) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_55 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute 
@ConvBackwardDataOp_f16f16_f16(%alloc_52, %18, %alloc_55) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_56 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_52, %alloc_56) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %53 = call @Unknown74(%38#1, %alloc_55) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> + %alloc_57 = memref.alloc() : memref<4x512x7x7xf16> %alloc_58 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_35, %arg83, %46, %alloc_56, %alloc_57, %alloc_58) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_59 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_56, %18, %alloc_59) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_60 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_56, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %47 = call @Unknown76(%36#1, %alloc_59) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> - %alloc_61 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_62 = memref.alloc() : memref<512xf32> + %alloc_59 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_32, %arg78, %53, %alloc_57, %alloc_58, %alloc_59) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_60 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_57, %17, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_61 = memref.alloc() : memref<512x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_57, %alloc_61) 
{batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x3x3xf16> + %alloc_62 = memref.alloc() : memref<4x512x7x7xf16> %alloc_63 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_33, %arg78, %47, %alloc_61, %alloc_62, %alloc_63) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_64 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_61, %17, %alloc_64) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_65 = memref.alloc() : memref<512x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_61, %alloc_65) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x3x3xf16> - %alloc_66 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_67 = memref.alloc() : memref<512xf32> - %alloc_68 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_31, %arg88, %46, %alloc_66, %alloc_67, %alloc_68) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_69 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_66, %16, %alloc_69) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x1x1xf16>, memref<4x256x14x14xf16> - %alloc_70 = memref.alloc() : memref<512x256x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_66, %alloc_70) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x1x1xf16> - %48 = call @Unknown83(%alloc_69, %alloc_64, %35#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> - %alloc_71 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_72 = memref.alloc() : memref<256xf32> + %alloc_64 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_30, %arg88, %52, %alloc_62, %alloc_63, %alloc_64) 
{epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_65 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_62, %16, %alloc_65) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x1x1xf16>, memref<4x256x14x14xf16> + %alloc_66 = memref.alloc() : memref<512x256x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_62, %alloc_66) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x1x1xf16> + %54 = call @Unknown89(%alloc_65, %alloc_60, %37#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> + %alloc_67 = memref.alloc() : memref<4x256x14x14xf16> + %alloc_68 = memref.alloc() : memref<256xf32> + %alloc_69 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_28, %arg73, %54, %alloc_67, %alloc_68, %alloc_69) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_70 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_67, %15, %alloc_70) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_71 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_67, %alloc_71) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %55 = call @Unknown93(%36#1, %alloc_70) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> + %alloc_72 = memref.alloc() : memref<4x256x14x14xf16> %alloc_73 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_29, %arg73, %48, %alloc_71, %alloc_72, %alloc_73) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_74 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_71, %15, %alloc_74) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, 
input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_75 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_71, %alloc_75) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %49 = call @Unknown87(%34#1, %alloc_74) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> - %alloc_76 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_77 = memref.alloc() : memref<256xf32> + %alloc_74 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_26, %arg68, %55, %alloc_72, %alloc_73, %alloc_74) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_75 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_72, %14, %alloc_75) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_76 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_72, %alloc_76) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %56 = call @Unknown89(%54, %alloc_75, %35#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> + %alloc_77 = memref.alloc() : memref<4x256x14x14xf16> %alloc_78 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_27, %arg68, %49, %alloc_76, %alloc_77, %alloc_78) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_79 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_76, %14, %alloc_79) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_80 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_76, %alloc_80) {batch_group_count = 1 : i64, 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %50 = call @Unknown91(%48, %alloc_79, %33#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> - %alloc_81 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_82 = memref.alloc() : memref<256xf32> + %alloc_79 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_24, %arg58, %56, %alloc_77, %alloc_78, %alloc_79) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_80 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_77, %13, %alloc_80) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_81 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_77, %alloc_81) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %57 = call @Unknown93(%34#1, %alloc_80) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> + %alloc_82 = memref.alloc() : memref<4x256x14x14xf16> %alloc_83 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_25, %arg58, %50, %alloc_81, %alloc_82, %alloc_83) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_84 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_81, %13, %alloc_84) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_85 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_81, %alloc_85) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %51 = call @Unknown95(%32#1, %alloc_84) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> - %alloc_86 = 
memref.alloc() : memref<4x256x14x14xf16> - %alloc_87 = memref.alloc() : memref<256xf32> + %alloc_84 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_22, %arg53, %57, %alloc_82, %alloc_83, %alloc_84) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_85 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_82, %12, %alloc_85) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_86 = memref.alloc() : memref<256x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_82, %alloc_86) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x3x3xf16> + %alloc_87 = memref.alloc() : memref<4x256x14x14xf16> %alloc_88 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_23, %arg53, %51, %alloc_86, %alloc_87, %alloc_88) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_89 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_86, %12, %alloc_89) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_90 = memref.alloc() : memref<256x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_86, %alloc_90) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x3x3xf16> - %alloc_91 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_92 = memref.alloc() : memref<256xf32> - %alloc_93 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_21, %arg63, %50, %alloc_91, %alloc_92, %alloc_93) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_94 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_91, %11, %alloc_94) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x1x1xf16>, memref<4x128x28x28xf16> - %alloc_95 = memref.alloc() : memref<256x128x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_91, %alloc_95) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x1x1xf16> - %52 = call @Unknown102(%alloc_94, %alloc_89, %31#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> - %alloc_96 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_97 = memref.alloc() : memref<128xf32> + %alloc_89 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_20, %arg63, %56, %alloc_87, %alloc_88, %alloc_89) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_90 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_87, %11, %alloc_90) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x1x1xf16>, memref<4x128x28x28xf16> + %alloc_91 = memref.alloc() : memref<256x128x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_87, %alloc_91) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x1x1xf16> + %58 = call @Unknown108(%alloc_90, %alloc_85, %33#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> + %alloc_92 = memref.alloc() : memref<4x128x28x28xf16> + %alloc_93 = memref.alloc() : memref<128xf32> + %alloc_94 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_18, %arg48, %58, %alloc_92, %alloc_93, %alloc_94) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_95 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_92, %10, %alloc_95) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_96 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute 
@ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_92, %alloc_96) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %59 = call @Unknown112(%32#1, %alloc_95) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> + %alloc_97 = memref.alloc() : memref<4x128x28x28xf16> %alloc_98 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_19, %arg48, %52, %alloc_96, %alloc_97, %alloc_98) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_99 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_96, %10, %alloc_99) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_100 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_96, %alloc_100) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %53 = call @Unknown106(%30#1, %alloc_99) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> - %alloc_101 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_102 = memref.alloc() : memref<128xf32> + %alloc_99 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_16, %arg43, %59, %alloc_97, %alloc_98, %alloc_99) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_100 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_97, %9, %alloc_100) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_101 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_97, %alloc_101) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %60 = call @Unknown108(%58, %alloc_100, %31#1) : 
(memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> + %alloc_102 = memref.alloc() : memref<4x128x28x28xf16> %alloc_103 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_17, %arg43, %53, %alloc_101, %alloc_102, %alloc_103) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_104 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_101, %9, %alloc_104) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_105 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_101, %alloc_105) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %54 = call @Unknown110(%52, %alloc_104, %29#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> - %alloc_106 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_107 = memref.alloc() : memref<128xf32> + %alloc_104 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_14, %arg33, %60, %alloc_102, %alloc_103, %alloc_104) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_105 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_102, %8, %alloc_105) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_106 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_102, %alloc_106) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %61 = call @Unknown112(%30#1, %alloc_105) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> + %alloc_107 = memref.alloc() : memref<4x128x28x28xf16> %alloc_108 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_15, %arg33, %54, %alloc_106, %alloc_107, %alloc_108) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : 
i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_109 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_106, %8, %alloc_109) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_110 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_106, %alloc_110) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %55 = call @Unknown114(%28#1, %alloc_109) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> - %alloc_111 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_112 = memref.alloc() : memref<128xf32> + %alloc_109 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_12, %arg28, %61, %alloc_107, %alloc_108, %alloc_109) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_110 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_107, %7, %alloc_110) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_111 = memref.alloc() : memref<128x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_107, %alloc_111) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x3x3xf16> + %alloc_112 = memref.alloc() : memref<4x128x28x28xf16> %alloc_113 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_13, %arg28, %55, %alloc_111, %alloc_112, %alloc_113) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_114 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_111, %7, %alloc_114) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, 
memref<128x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_115 = memref.alloc() : memref<128x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_111, %alloc_115) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x3x3xf16> - %alloc_116 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_117 = memref.alloc() : memref<128xf32> - %alloc_118 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_11, %arg38, %54, %alloc_116, %alloc_117, %alloc_118) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_119 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_116, %6, %alloc_119) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x1x1xf16>, memref<4x64x56x56xf16> - %alloc_120 = memref.alloc() : memref<128x64x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_116, %alloc_120) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x1x1xf16> - %56 = call @Unknown121(%alloc_119, %alloc_114, %27#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> - %alloc_121 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_122 = memref.alloc() : memref<64xf32> + %alloc_114 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_10, %arg38, %60, %alloc_112, %alloc_113, %alloc_114) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_115 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_112, %6, %alloc_115) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x1x1xf16>, memref<4x64x56x56xf16> + %alloc_116 = memref.alloc() : memref<128x64x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_112, %alloc_116) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, 
memref<128x64x1x1xf16> + %62 = call @Unknown127(%alloc_115, %alloc_110, %29#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> + %alloc_117 = memref.alloc() : memref<4x64x56x56xf16> + %alloc_118 = memref.alloc() : memref<64xf32> + %alloc_119 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_8, %arg23, %62, %alloc_117, %alloc_118, %alloc_119) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_120 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_117, %5, %alloc_120) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_121 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_117, %alloc_121) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %63 = call @Unknown131(%28#1, %alloc_120) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_122 = memref.alloc() : memref<4x64x56x56xf16> %alloc_123 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_9, %arg23, %56, %alloc_121, %alloc_122, %alloc_123) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_124 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_121, %5, %alloc_124) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_125 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_121, %alloc_125) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %57 = call @Unknown125(%26#1, %alloc_124) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_126 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_127 = memref.alloc() : memref<64xf32> + %alloc_124 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_6, %arg18, %63, %alloc_122, %alloc_123, %alloc_124) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, 
memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_125 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_122, %4, %alloc_125) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_126 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_122, %alloc_126) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %64 = call @Unknown127(%62, %alloc_125, %27#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> + %alloc_127 = memref.alloc() : memref<4x64x56x56xf16> %alloc_128 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_7, %arg18, %57, %alloc_126, %alloc_127, %alloc_128) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_129 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_126, %4, %alloc_129) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_130 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%25#0, %alloc_126, %alloc_130) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %58 = call @Unknown129(%56, %alloc_129, %25#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> - %alloc_131 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_132 = memref.alloc() : memref<64xf32> + %alloc_129 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_4, %arg13, %64, %alloc_127, %alloc_128, %alloc_129) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_130 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_127, %3, %alloc_130) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", 
memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_131 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_127, %alloc_131) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %65 = call @Unknown131(%26#1, %alloc_130) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_132 = memref.alloc() : memref<4x64x56x56xf16> %alloc_133 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_5, %arg13, %58, %alloc_131, %alloc_132, %alloc_133) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_134 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_131, %3, %alloc_134) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_135 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%24#0, %alloc_131, %alloc_135) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %59 = call @Unknown133(%24#1, %alloc_134) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_136 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_137 = memref.alloc() : memref<64xf32> - %alloc_138 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_3, %arg8, %59, %alloc_136, %alloc_137, %alloc_138) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_139 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_136, %2, %alloc_139) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_140 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%alloc_2, %alloc_136, %alloc_140) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 
: i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %60 = call @Unknown137(%58, %alloc_139) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_141 = memref.alloc() : memref<4x64x112x112xf16> - byre.compute @PoolMaxGradOp_f16f16_f16(%23#0, %60, %alloc_141) {memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16>, memref<4x64x112x112xf16> - %61 = call @Unknown138(%23#1, %alloc_141) : (memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> - %alloc_142 = memref.alloc() : memref<4x64x112x112xf16> - %alloc_143 = memref.alloc() : memref<64xf32> - %alloc_144 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %61, %alloc_142, %alloc_143, %alloc_144) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<4x64x112x112xf16>, memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32> - %alloc_145 = memref.alloc() : memref<64x3x7x7xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_142, %alloc_145) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16>, memref<4x64x112x112xf16>, memref<64x3x7x7xf16> - %alloc_146 = memref.alloc() : memref - byre.compute @ReduceSumOp_f32_f32(%43#1, %alloc_146) {dimensions = dense<[0, 1]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32>, memref - %62 = call @Unknown141(%alloc_146) : (memref) -> memref - %63 = call @Unknown142(%alloc_145) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> - %64 = call @Unknown143(%alloc_140) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %65 = call @Unknown144(%alloc_135) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %66 = call @Unknown145(%alloc_130) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %67 = call @Unknown146(%alloc_125) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %68 = call @Unknown147(%alloc_115) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> - %69 = call @Unknown148(%alloc_110) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %70 = call @Unknown149(%alloc_120) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> - %71 = call @Unknown150(%alloc_105) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %72 = call @Unknown151(%alloc_100) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %73 = call @Unknown152(%alloc_90) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> - %74 = call @Unknown153(%alloc_85) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %75 = call @Unknown154(%alloc_95) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> - %76 = call @Unknown155(%alloc_80) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %77 = call @Unknown156(%alloc_75) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %78 = call @Unknown157(%alloc_65) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> - %79 = call 
@Unknown158(%alloc_60) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %80 = call @Unknown159(%alloc_70) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> - %81 = call @Unknown160(%alloc_55) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %82 = call @Unknown161(%alloc_50) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %alloc_147 = memref.alloc() : memref<1000x512xf16> - byre.compute @MatmulOp_f16f16_f16(%40, %43#0, %alloc_147) {lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16>, memref<4x1000xf16>, memref<1000x512xf16> - %83 = call @Unknown163(%alloc_147) : (memref<1000x512xf16>) -> memref<1000x512xf32> - %alloc_148 = memref.alloc() : memref<1000xf32> - byre.compute @ReduceSumOp_f32_f32(%43#2, %alloc_148) {dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32>, memref<1000xf32> - %84 = call @Unknown164(%alloc_148) : (memref<1000xf32>) -> memref<1000xf32> - return %62, %63, %alloc_143, %alloc_144, %64, %alloc_137, %alloc_138, %65, %alloc_132, %alloc_133, %66, %alloc_127, %alloc_128, %67, %alloc_122, %alloc_123, %68, %alloc_112, %alloc_113, %69, %alloc_107, %alloc_108, %70, %alloc_117, %alloc_118, %71, %alloc_102, %alloc_103, %72, %alloc_97, %alloc_98, %73, %alloc_87, %alloc_88, %74, %alloc_82, %alloc_83, %75, %alloc_92, %alloc_93, %76, %alloc_77, %alloc_78, %77, %alloc_72, %alloc_73, %78, %alloc_62, %alloc_63, %79, %alloc_57, %alloc_58, %80, %alloc_67, %alloc_68, %81, %alloc_52, %alloc_53, %82, %alloc_47, %alloc_48, %83, %84 : memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32> + %alloc_134 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_2, %arg8, %65, %alloc_132, %alloc_133, %alloc_134) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_135 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_132, %2, %alloc_135) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", 
padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_136 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%alloc_1, %alloc_132, %alloc_136) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %66 = call @Unknown143(%64, %alloc_135) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_137 = memref.alloc() : memref<4x64x112x112xf16> + byre.compute @PoolMaxGradOp_f16f16_f16(%25#0, %66, %alloc_137) {memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16>, memref<4x64x112x112xf16> + %67 = call @Unknown144(%25#1, %alloc_137) : (memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> + %alloc_138 = memref.alloc() : memref<4x64x112x112xf16> + %alloc_139 = memref.alloc() : memref<64xf32> + %alloc_140 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %67, %alloc_138, %alloc_139, %alloc_140) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<4x64x112x112xf16>, memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32> + %alloc_141 = memref.alloc() : memref<64x3x7x7xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_138, %alloc_141) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16>, memref<4x64x112x112xf16>, memref<64x3x7x7xf16> + %68 = call @Unknown147(%49#0, %arg1) : (memref<4x1000xf16>, memref<4x1000xf32>) -> memref + %69 = call @Unknown148(%68) : (memref) -> memref + %70 = call @Unknown149(%alloc_141) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> + %71 = call @Unknown150(%alloc_136) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %72 = call @Unknown150(%alloc_131) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %73 = call @Unknown150(%alloc_126) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %74 = call @Unknown150(%alloc_121) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %75 = call @Unknown154(%alloc_111) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> + %76 = call @Unknown155(%alloc_106) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %77 = call @Unknown156(%alloc_116) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> + %78 = call @Unknown155(%alloc_101) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %79 = call @Unknown155(%alloc_96) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %80 = call @Unknown159(%alloc_86) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> + %81 = call @Unknown160(%alloc_81) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %82 = call @Unknown161(%alloc_91) : (memref<256x128x1x1xf16>) -> 
memref<256x128x1x1xf32> + %83 = call @Unknown160(%alloc_76) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %84 = call @Unknown160(%alloc_71) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %85 = call @Unknown164(%alloc_61) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> + %86 = call @Unknown165(%alloc_56) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %87 = call @Unknown166(%alloc_66) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> + %88 = call @Unknown165(%alloc_51) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %89 = call @Unknown165(%alloc_46) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %alloc_142 = memref.alloc() : memref<1000x512xf16> + byre.compute @MatmulOp_f16f16_f16(%43, %49#1, %alloc_142) {lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16>, memref<4x1000xf16>, memref<1000x512xf16> + %90 = call @Unknown170(%alloc_142) : (memref<1000x512xf16>) -> memref<1000x512xf32> + %91 = call @Unknown171(%49#1) : (memref<4x1000xf16>) -> memref<1000xf32> + %92 = call @Unknown172(%91) : (memref<1000xf32>) -> memref<1000xf32> + return %69, %70, %alloc_139, %alloc_140, %71, %alloc_133, %alloc_134, %72, %alloc_128, %alloc_129, %73, %alloc_123, %alloc_124, %74, %alloc_118, %alloc_119, %75, %alloc_108, %alloc_109, %76, %alloc_103, %alloc_104, %77, %alloc_113, %alloc_114, %78, %alloc_98, %alloc_99, %79, %alloc_93, %alloc_94, %80, %alloc_83, %alloc_84, %81, %alloc_78, %alloc_79, %82, %alloc_88, %alloc_89, %83, %alloc_73, %alloc_74, %84, %alloc_68, %alloc_69, %85, %alloc_58, %alloc_59, %86, %alloc_53, %alloc_54, %87, %alloc_63, %alloc_64, %88, %alloc_48, %alloc_49, %89, %alloc_43, %alloc_44, %90, %92 : memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/8_byre_opt.mlir b/compiler/test/E2E/ResNet18/Whole/8_byre_opt.mlir index 551a80c98..2da0299c3 100644 --- a/compiler/test/E2E/ResNet18/Whole/8_byre_opt.mlir +++ b/compiler/test/E2E/ResNet18/Whole/8_byre_opt.mlir @@ -4,4058 +4,2191 @@ module @IrToMhlo.2452 attributes {gpu.container_module} { gpu.module @unified { - gpu.func @Unknown164(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { + gpu.func @Unknown172(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { %c1000 
= arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<1000xf32> - %7 = arith.truncf %6 : f32 to f16 - %8 = arith.extf %7 : f16 to f32 - memref.store %8, %arg1[%4] : memref<1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg2] : memref<1000xf32> + %8 = arith.truncf %7 : f32 to f16 + %9 = arith.extf %8 : f16 to f32 + memref.store %9, %arg1[%arg2] : memref<1000xf32> } gpu.return } - gpu.func @Unknown163(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown170(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf32> } gpu.return } - gpu.func @Unknown161(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { + gpu.func @Unknown166(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { + %c131072 = arith.constant 131072 : index %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = 
arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> } gpu.return } - gpu.func @Unknown160(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown165(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf32> } gpu.return } - gpu.func @Unknown159(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c131072 = arith.constant 131072 : index + gpu.func @Unknown164(%arg0: memref<512x256x3x3xf16>, %arg1: 
memref<512x256x3x3xf32>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf32> } gpu.return } - gpu.func @Unknown158(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { + gpu.func @Unknown161(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { + %c32768 = arith.constant 32768 : index %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi 
%arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> } gpu.return } - gpu.func @Unknown157(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown160(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { + %c589824 = arith.constant 589824 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf32> } gpu.return } - gpu.func @Unknown156(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index + gpu.func @Unknown159(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { + %c294912 = arith.constant 294912 : index + %c128 = arith.constant 128 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi 
slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf32> } gpu.return } - gpu.func @Unknown155(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { + gpu.func @Unknown156(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { + %c8192 = arith.constant 8192 : index %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : 
memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> } gpu.return } - gpu.func @Unknown154(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c32768 = arith.constant 32768 : index + gpu.func @Unknown155(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { + %c147456 = arith.constant 147456 : index %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf32> } gpu.return } - gpu.func @Unknown153(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index + gpu.func @Unknown154(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { + %c73728 = arith.constant 73728 : index + %c64 = arith.constant 64 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, 
%24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf32> } gpu.return } - gpu.func @Unknown152(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index + gpu.func @Unknown150(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { + %c36864 = arith.constant 36864 : index + %c64 = arith.constant 64 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] 
: memref<64x64x3x3xf32> } gpu.return } - gpu.func @Unknown151(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index + gpu.func @Unknown149(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { + %c9408 = arith.constant 9408 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf32> } gpu.return } - gpu.func @Unknown150(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown148(%arg0: memref, %arg1: memref) kernel { + %c1 = arith.constant 1 : index + %cst = arith.constant 4.000000e+00 : f32 %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi 
slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1 step %6 { + %7 = memref.load %arg0[] : memref + %8 = arith.negf %7 : f32 + %9 = arith.divf %8, %cst : f32 + memref.store %9, %arg1[] : memref } gpu.return } - gpu.func @Unknown149(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c8192 = arith.constant 8192 : index + gpu.func @Unknown144(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xf16>) kernel { + %c3211264 = arith.constant 3211264 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c3211264 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x112x112xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x112x112xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x112x112xf16> } gpu.return } - gpu.func @Unknown148(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown143(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = 
gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.addf %13, %14 : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown147(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown131(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = 
arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown146(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown127(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + 
%13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x64x56x56xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown145(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown112(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { + %c401408 = arith.constant 401408 : index + %cst = arith.constant 0.000000e+00 : f16 + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xf16> } gpu.return } - gpu.func @Unknown144(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown108(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + %c401408 = arith.constant 401408 : index + %cst = arith.constant 0.000000e+00 : f16 
+ %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x128x28x28xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x128x28x28xf16> } gpu.return } - gpu.func @Unknown143(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown93(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, 
%18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xf16> } gpu.return } - gpu.func @Unknown142(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index + gpu.func @Unknown89(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf32> + %5 = gpu.grid_dim x + %6 = 
arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x256x14x14xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x256x14x14xf16> } gpu.return } - gpu.func @Unknown141(%arg0: memref<f32>, %arg1: memref<f32>) kernel { - %cst = arith.constant 4.000000e+00 : f32 - %c1 = arith.constant 1 : index + gpu.func @Unknown78(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>, %arg3: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 0.000000e+00 : f16 + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1 : index - scf.if %5 { - %6 = memref.load %arg0[] : memref<f32> - %7 = arith.negf %6 : f32 - %8 = arith.divf %7, %cst : f32 - memref.store %8, %arg1[] : memref<f32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown138(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xf16>) kernel { + gpu.func @Unknown74(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c3211264 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 
= arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x112x112xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x112x112xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown137(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown70(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>, %arg2: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 4.900000e+01 : f16 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - memref.store %38, %arg2[%35, %29, %19, 
%9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11] : memref<4x512xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %15 = arith.divf %13, %cst : f16 + %16 = arith.select %14, %15, %cst_0 : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown133(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown69(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf16>, %arg5: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg6 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg6, %c1000 : index + %8 = arith.divsi %arg6, %c1000 : index + %9 = memref.load %arg2[%8] : memref<4xf16> + %10 = memref.load %arg0[%8] : memref<4xf16> + %11 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %12 = memref.load %arg3[%8, %7] : memref<4x1000xf16> + %13 = arith.subf %11, %10 : f16 + %14 = math.exp %13 : f16 + %15 = arith.mulf %14, %9 : f16 + %16 = arith.subf %12, %15 : f16 + memref.store %13, %arg4[%8, %7] : memref<4x1000xf16> + memref.store %16, %arg5[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown129(%arg0: memref<4x64x56x56xf16>, %arg1: 
memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown68(%arg0: memref<4xf16>, %arg1: memref<4xf16>) kernel { + %c4 = arith.constant 4 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c4 step %6 { + %7 = memref.load %arg0[%arg2] : memref<4xf16> + %8 = math.log %7 : f16 + memref.store %8, %arg1[%arg2] : memref<4xf16> } gpu.return } - gpu.func @Unknown125(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown66(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - 
%18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg3, %c1000 : index + %8 = arith.divsi %arg3, %c1000 : index + %9 = memref.load %arg0[%8] : memref<4xf16> + %10 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %11 = arith.subf %10, %9 : f16 + memref.store %11, %arg2[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown121(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown64(%arg0: memref<1000xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, 
%cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg3, %c1000 : index + %8 = arith.divsi %arg3, %c1000 : index + %9 = memref.load %arg0[%7] : memref<1000xf16> + %10 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %11 = arith.addf %10, %9 : f16 + memref.store %11, %arg2[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown114(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown63(%arg0: memref<4x512xf16>, %arg1: memref<4x512xf16>) kernel { + %c2048 = arith.constant 2048 : index + %cst = arith.constant 2.040100e-02 : f16 + %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2048 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<4x512xf16> + %10 = arith.mulf %9, %cst : f16 + memref.store %10, %arg1[%8, %7] : memref<4x512xf16> } gpu.return } - gpu.func @Unknown110(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + 
%c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x512x7x7xi1> } gpu.return } - gpu.func @Unknown106(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 
= arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xi1> } gpu.return } - gpu.func @Unknown102(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, 
%26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x256x14x14xi1> } gpu.return } - gpu.func @Unknown95(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index + gpu.func @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + %cst = arith.constant 0.000000e+00 : f16 %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = 
arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xi1> } gpu.return } - gpu.func @Unknown91(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { + %c401408 = arith.constant 401408 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst 
: f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x128x28x28xi1> } gpu.return } - gpu.func @Unknown87(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { + %c401408 = arith.constant 401408 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xi1> } gpu.return } - gpu.func @Unknown83(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = 
arith.constant 256 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x64x56x56xi1> } gpu.return } - gpu.func @Unknown76(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { + gpu.func @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - 
%14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown72(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>, %arg3: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown68(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant 
-1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown64(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>, %arg2: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg0[%35, %29] : 
memref<4x512xf16> - %38 = arith.divf %37, %cst_0 : f16 - %39 = arith.select %36, %38, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown63(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf32>, %arg5: memref<4x1000xf16>, %arg6: memref<4x1000xf32>, %arg7: memref<4x1000xf32>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg3[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %18 = memref.load %arg0[%15] : memref<4xf16> - %19 = memref.load %arg2[%15] : memref<4xf16> - %20 = memref.load %arg4[%15, %9] : memref<4x1000xf32> - %21 = math.log %18 : f16 - %22 = arith.subf %17, %21 : f16 - %23 = math.exp %22 : f16 - %24 = arith.mulf %23, %19 : f16 - %25 = arith.subf %16, %24 : f16 - %26 = arith.extf %22 : f16 to f32 - %27 = arith.mulf %26, %20 : f32 - %28 = arith.extf %25 : f16 to f32 - memref.store %25, %arg5[%15, %9] : memref<4x1000xf16> - memref.store %27, %arg6[%15, %9] : memref<4x1000xf32> - memref.store %28, %arg7[%15, %9] : memref<4x1000xf32> - } - gpu.return - } - gpu.func @Unknown62(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>, %arg3: memref<4x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg0[%15] : memref<4xf16> - %18 = arith.subf %16, %17 : f16 - %19 = math.exp %18 : f16 - memref.store %18, %arg2[%15, %9] : memref<4x1000xf16> - memref.store %19, %arg3[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = 
arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg0[%9] : memref<1000xf32> - %18 = arith.truncf %17 : f32 to f16 - %19 = arith.addf %16, %18 : f16 - memref.store %19, %arg2[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown60(%arg0: memref<4x512xf16>, %arg1: memref<4x512xf16>) kernel { - %cst = arith.constant 2.040100e-02 : f16 - %c0 = arith.constant 0 : index - %c2048 = arith.constant 2048 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2048 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<4x512xf16> - %17 = arith.mulf %16, %cst : f16 - memref.store %17, %arg1[%15, %9] : memref<4x512xf16> - } - gpu.return - } - gpu.func @Unknown59(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, 
%39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = 
arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown53(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown50(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = 
arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: 
memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown44(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : 
index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown41(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = 
arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown35(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) 
kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown32(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - 
%33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xi1> - } - gpu.return - } - gpu.func @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - } - gpu.return - } - gpu.func @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = 
arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xi1> } gpu.return } - gpu.func @Unknown26(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { + gpu.func @Unknown26(%arg0: memref<4x64x112x112xf16>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xi1>) kernel { + %c3211264 = arith.constant 3211264 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, 
%25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c3211264 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x112x112xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x64x112x112xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x112x112xi1> } gpu.return } - gpu.func @Unknown24(%arg0: memref<4x64x112x112xf16>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown24(%arg0: memref<1000xf32>, %arg1: memref<1000xf16>) kernel { + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c3211264 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x112x112xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x112x112xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x112x112xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg2] : memref<1000xf32> + %8 = arith.truncf %7 : f32 to f16 + memref.store %8, %arg1[%arg2] : memref<1000xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { - %c0 = arith.constant 0 
: index %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf16> } gpu.return } gpu.func @Unknown22(%arg0: memref<4x1000xf32>, %arg1: memref<4x1000xf16>) kernel { - %cst = arith.constant -2.500000e-01 : f32 - %c0 = arith.constant 0 : index %c4000 = arith.constant 4000 : index + %cst = arith.constant -2.500000e-01 : f32 %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<4x1000xf32> - %17 = arith.mulf %16, %cst : f32 - %18 = arith.truncf %17 : f32 to f16 - memref.store %18, %arg1[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown21(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi 
%25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown20(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg2, %c1000 : index + %8 = arith.divsi %arg2, %c1000 : index + %9 = memref.load %arg0[%8, %7] : memref<4x1000xf32> + %10 = arith.mulf %9, %cst : f32 + %11 = arith.truncf %10 : f32 to f16 + memref.store %11, %arg1[%8, %7] : memref<4x1000xf16> } gpu.return } gpu.func @Unknown19(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, 
%c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf16> } gpu.return } gpu.func @Unknown18(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi 
%arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf16> } gpu.return } gpu.func @Unknown17(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - } - gpu.return - } - gpu.func @Unknown16(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown15(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index %0 = 
gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> } gpu.return } gpu.func @Unknown14(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, 
%33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf16> } gpu.return } gpu.func @Unknown13(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c294912 = arith.constant 294912 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf16> } gpu.return } gpu.func @Unknown12(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, 
%6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - } - gpu.return - } - gpu.func @Unknown11(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown10(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, 
%23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> } gpu.return } gpu.func @Unknown9(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf16> } gpu.return } gpu.func @Unknown8(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : 
index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf16> } gpu.return } gpu.func @Unknown7(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> } gpu.return } - gpu.func @Unknown6(%arg0: 
memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown3(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf16> } gpu.return } - gpu.func @Unknown5(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index + gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { + %c9408 = arith.constant 9408 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = 
arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf16> } gpu.return } - gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index + gpu.func @Unknown0(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x3x224x224xf16>) kernel { + %c602112 = arith.constant 602112 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c224 = arith.constant 224 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c602112 step %6 { + %7 = arith.remsi %arg2, %c224 : index + %8 = arith.divsi %arg2, %c224 : index + %9 = arith.remsi %8, %c224 : index + %10 = arith.divsi %8, %c224 : index + %11 = arith.remsi 
%10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x3x224x224xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x3x224x224xf16> } gpu.return } - gpu.func @Unknown3(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index + gpu.func @Unknown25_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %32 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %32 : f16 + } else { + scf.yield %cst : f16 + } + %19 = arith.addf %18, %cst : f16 + %20 = arith.cmpi ugt, %16, %c1 : index + %21 = scf.if %20 -> (f16) { + %32 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %32 : f16 + } else { + scf.yield %cst : f16 + } + %22 = arith.addf %19, %21 : f16 + memref.store %22, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %23 = arith.cmpi ult, %1, %c256 : index + scf.if %23 { + %32 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca[%34] : memref<512xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %24 = arith.cmpi ult, %1, %c128 : index + scf.if %24 { + %32 = 
memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_2[%34] : memref<256xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c64 : index + scf.if %25 { + %32 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_3[%34] : memref<128xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c32 : index + scf.if %26 { + %32 = memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_4[%34] : memref<64xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c16 : index + scf.if %27 { + %32 = memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_5[%34] : memref<32xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c8 : index + scf.if %28 { + %32 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_6[%34] : memref<16xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_7[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c4 : index + scf.if %29 { + %32 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_7[%34] : memref<8xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %30 = arith.cmpi ult, %1, %c2 : index + scf.if %30 { + %32 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_8[%34] : memref<4xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %31 = arith.cmpi ult, %1, %c1 : index + scf.if %31 { + %32 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_9[%34] : memref<2xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown62_kernel(%arg0: memref<2048x49xf16>, %arg1: memref<2048xf16>) kernel attributes 
{gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c49 = arith.constant 49 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index + %subview = memref.subview %arg0[%0, 0] [1, 49] [1, 1] : memref<2048x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c64 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c64 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c49 : index + %7 = arith.select %6, %5, %c49 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c49 : index + %10 = arith.select %9, %8, %c49 : index + %11 = arith.subi %10, %7 : index + %subview_0 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13 = scf.if %12 -> (f16) { + %21 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %14 = arith.addf %13, %cst : f16 + memref.store %14, %alloca[%1] : memref<64xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<32xf16, #gpu.address_space> 
+ %15 = arith.cmpi ult, %1, %c32 : index + scf.if %15 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca[%21] : memref<64xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca[%24] : memref<64xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_2[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<16xf16, #gpu.address_space> + %16 = arith.cmpi ult, %1, %c16 : index + scf.if %16 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_2[%21] : memref<32xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_2[%24] : memref<32xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_3[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<8xf16, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c8 : index + scf.if %17 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_3[%21] : memref<16xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_3[%24] : memref<16xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_4[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<4xf16, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c4 : index + scf.if %18 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_4[%21] : memref<8xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_4[%24] : memref<8xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_5[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<2xf16, #gpu.address_space> + %19 = arith.cmpi ult, %1, %c2 : index + scf.if %19 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_5[%21] : memref<4xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_5[%24] : memref<4xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_6[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %20 = arith.cmpi ult, %1, %c1 : index + scf.if %20 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_6[%21] : memref<2xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_6[%24] : memref<2xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %arg1[%0] : memref<2048xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown65_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = 
arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf16> - } - gpu.return - } - gpu.func @Unknown0(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x3x224x224xf16>) kernel { %c0 = arith.constant 0 : index - %c602112 = arith.constant 602112 : index - %c224 = arith.constant 224 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %31 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %31 : f16 + } else { + scf.yield %cst : f16 + } + %19 = arith.cmpi ugt, %16, %c1 : index + %20 = scf.if %19 -> (f16) { + %31 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %31 : f16 + } else { + scf.yield %cst : f16 + } + %21 = arith.maximumf %18, %20 : f16 + memref.store %21, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : 
memref<256xf16, #gpu.address_space> + %22 = arith.cmpi ult, %1, %c256 : index + scf.if %22 { + %31 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca[%32] : memref<512xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %23 = arith.cmpi ult, %1, %c128 : index + scf.if %23 { + %31 = memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_2[%32] : memref<256xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %24 = arith.cmpi ult, %1, %c64 : index + scf.if %24 { + %31 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_3[%32] : memref<128xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c32 : index + scf.if %25 { + %31 = memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_4[%32] : memref<64xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c16 : index + scf.if %26 { + %31 = memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_5[%32] : memref<32xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c8 : index + scf.if %27 { + %31 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_6[%32] : memref<16xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_7[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c4 : index + scf.if %28 { + %31 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_7[%32] : memref<8xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c2 : index + scf.if %29 { + %31 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_8[%32] : memref<4xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %30 = arith.cmpi ult, %1, %c1 : index + scf.if %30 { + %31 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = 
memref.load %alloca_9[%32] : memref<2xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown67_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %34 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %34 : f16 + } else { + scf.yield %cst : f16 + } + %19 = math.exp %18 : f16 + %20 = arith.addf %19, %cst : f16 + %21 = arith.cmpi ugt, %16, %c1 : index + %22 = scf.if %21 -> (f16) { + %34 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %34 : f16 + } else { + scf.yield %cst : f16 + } + %23 = math.exp %22 : f16 + %24 = arith.addf %20, %23 : f16 + memref.store %24, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c256 : index + scf.if %25 { + %34 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca[%36] : memref<512xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c128 : index + scf.if %26 { + %34 = memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_2[%36] : 
memref<256xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c64 : index + scf.if %27 { + %34 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_3[%36] : memref<128xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c32 : index + scf.if %28 { + %34 = memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_4[%36] : memref<64xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c16 : index + scf.if %29 { + %34 = memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_5[%36] : memref<32xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %30 = arith.cmpi ult, %1, %c8 : index + scf.if %30 { + %34 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_6[%36] : memref<16xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_7[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %31 = arith.cmpi ult, %1, %c4 : index + scf.if %31 { + %34 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_7[%36] : memref<8xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %32 = arith.cmpi ult, %1, %c2 : index + scf.if %32 { + %34 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_8[%36] : memref<4xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %33 = arith.cmpi ult, %1, %c1 : index + scf.if %33 { + %34 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_9[%36] : memref<2xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown147_kernel(%arg0: memref<32x125xf16>, %arg1: memref<32x125xf32>, %arg2: memref<32xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = 
arith.constant 16 : index + %c32 = arith.constant 32 : index + %c2 = arith.constant 2 : index + %c64 = arith.constant 64 : index + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c125 = arith.constant 125 : index + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c602112 : index - scf.if %5 { - %6 = arith.remsi %4, %c224 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c224 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c224 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c224 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c224 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c224 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x3x224x224xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x3x224x224xf16> - } + %subview = memref.subview %arg0[%0, 0] [1, 125] [1, 1] : memref<32x125xf16> to memref<125xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<125xf16, strided<[1], offset: ?>> into memref<1x125xf16, strided<[125, 1], offset: ?>> + %subview_1 = memref.subview %arg1[%0, 0] [1, 125] [1, 1] : memref<32x125xf32> to memref<125xf32, strided<[1], offset: ?>> + %expand_shape_2 = memref.expand_shape %subview_1 [[0, 1]] : memref<125xf32, strided<[1], offset: ?>> into memref<1x125xf32, strided<[125, 1], offset: ?>> + %alloca = memref.alloca() : memref<128xf32, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c128 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c128 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c125 : index + %7 = arith.select %6, %5, %c125 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c125 : index + %10 = arith.select %9, %8, %c125 : index + %11 = arith.subi %10, %7 : index + %subview_3 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x125xf16, strided<[125, 1], offset: ?>> to memref> + %expand_shape_4 = memref.expand_shape %subview_3 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %subview_5 = memref.subview %expand_shape_2[0, %7] [1, %11] [1, 1] : memref<1x125xf32, strided<[125, 1], offset: ?>> to memref> + %expand_shape_6 = memref.expand_shape %subview_5 [[0, 1]] : memref> into memref<1x?xf32, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13:2 = scf.if %12 -> (f16, f32) { + %24 = memref.load 
%expand_shape_4[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + %25 = memref.load %expand_shape_6[%c0, %c0] : memref<1x?xf32, strided<[?, 1], offset: ?>> + scf.yield %24, %25 : f16, f32 + } else { + scf.yield %cst_0, %cst : f16, f32 + } + %14 = arith.extf %13#0 : f16 to f32 + %15 = arith.mulf %14, %13#1 : f32 + %16 = arith.addf %15, %cst : f32 + memref.store %16, %alloca[%1] : memref<128xf32, #gpu.address_space> + gpu.barrier + %alloca_7 = memref.alloca() : memref<64xf32, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c64 : index + scf.if %17 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca[%24] : memref<128xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca[%27] : memref<128xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_7[%1] : memref<64xf32, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<32xf32, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c32 : index + scf.if %18 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_7[%24] : memref<64xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_7[%27] : memref<64xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_8[%1] : memref<32xf32, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<16xf32, #gpu.address_space> + %19 = arith.cmpi ult, %1, %c16 : index + scf.if %19 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_8[%24] : memref<32xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_8[%27] : memref<32xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_9[%1] : memref<16xf32, #gpu.address_space> + } + gpu.barrier + %alloca_10 = memref.alloca() : memref<8xf32, #gpu.address_space> + %20 = arith.cmpi ult, %1, %c8 : index + scf.if %20 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_9[%24] : memref<16xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_9[%27] : memref<16xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_10[%1] : memref<8xf32, #gpu.address_space> + } + gpu.barrier + %alloca_11 = memref.alloca() : memref<4xf32, #gpu.address_space> + %21 = arith.cmpi ult, %1, %c4 : index + scf.if %21 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_10[%24] : memref<8xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_10[%27] : memref<8xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_11[%1] : memref<4xf32, #gpu.address_space> + } + gpu.barrier + %alloca_12 = memref.alloca() : memref<2xf32, #gpu.address_space> + %22 = arith.cmpi ult, %1, %c2 : index + scf.if %22 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_11[%24] : memref<4xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_11[%27] : memref<4xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_12[%1] : memref<2xf32, #gpu.address_space> + } + gpu.barrier + %23 = arith.cmpi ult, %1, %c1 : index + scf.if %23 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_12[%24] : memref<2xf32, 
#gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_12[%27] : memref<2xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %arg2[%0] : memref<32xf32> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown147_kernel_0(%arg0: memref<32xf32>, %arg1: memref) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c16 = arith.constant 16 : index + %cst = arith.constant 0.000000e+00 : f32 + %c32 = arith.constant 32 : index + %0 = gpu.block_id x + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %0, %c32 : index + %3 = arith.addi %2, %1 : index + %4 = memref.load %arg0[%3] : memref<32xf32> + %5 = arith.addf %4, %cst : f32 + memref.store %5, %alloca[%1] : memref<32xf32, #gpu.address_space> + gpu.barrier + %alloca_0 = memref.alloca() : memref<16xf32, #gpu.address_space> + %6 = arith.cmpi ult, %1, %c16 : index + scf.if %6 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca[%11] : memref<32xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca[%14] : memref<32xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_0[%1] : memref<16xf32, #gpu.address_space> + } + gpu.barrier + %alloca_1 = memref.alloca() : memref<8xf32, #gpu.address_space> + %7 = arith.cmpi ult, %1, %c8 : index + scf.if %7 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_0[%11] : memref<16xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_0[%14] : memref<16xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_1[%1] : memref<8xf32, #gpu.address_space> + } + gpu.barrier + %alloca_2 = memref.alloca() : memref<4xf32, #gpu.address_space> + %8 = arith.cmpi ult, %1, %c4 : index + scf.if %8 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_1[%11] : memref<8xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_1[%14] : memref<8xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_2[%1] : memref<4xf32, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<2xf32, #gpu.address_space> + %9 = arith.cmpi ult, %1, %c2 : index + scf.if %9 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_2[%11] : memref<4xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_2[%14] : memref<4xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_3[%1] : memref<2xf32, #gpu.address_space> + } + gpu.barrier + %10 = arith.cmpi ult, %1, %c1 : index + scf.if %10 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_3[%11] : memref<2xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_3[%14] : memref<2xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %arg1[] : memref + } + gpu.barrier + gpu.return + } + gpu.func @Unknown171_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<1000xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %cst = 
arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1000 = arith.constant 1000 : index + %c-32 = arith.constant -32 : index + %0 = gpu.block_id x + %1 = arith.muli %0, %c-32 : index + %2 = arith.addi %1, %c1000 : index + %3 = arith.cmpi slt, %2, %c32 : index + %4 = arith.select %3, %2, %c32 : index + %5 = arith.muli %0, %c32 : index + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %alloca_1 = memref.alloca() : memref<2x32xf32, #gpu.address_space> + %6 = gpu.thread_id x + %7 = gpu.thread_id y + %8 = arith.cmpi slt, %4, %6 : index + %9 = arith.select %8, %4, %6 : index + %10 = arith.addi %6, %c1 : index + %11 = arith.cmpi slt, %4, %10 : index + %12 = arith.select %11, %4, %10 : index + %13 = arith.subi %12, %9 : index + %14 = arith.cmpi ugt, %13, %c0 : index + %15 = scf.if %14 -> (f16) { + %22 = arith.muli %7, %c2 : index + %23 = arith.addi %5, %9 : index + %24 = memref.load %arg0[%22, %23] : memref<4x1000xf16> + scf.yield %24 : f16 + } else { + scf.yield %cst_0 : f16 + } + %16 = arith.extf %15 : f16 to f32 + %17 = arith.addf %16, %cst : f32 + %18 = scf.if %14 -> (f16) { + %22 = arith.muli %7, %c2 : index + %23 = arith.addi %22, %c1 : index + %24 = arith.addi %5, %9 : index + %25 = memref.load %arg0[%23, %24] : memref<4x1000xf16> + scf.yield %25 : f16 + } else { + scf.yield %cst_0 : f16 + } + %19 = arith.extf %18 : f16 to f32 + %20 = arith.addf %17, %19 : f32 + memref.store %20, %alloca_1[%7, %6] : memref<2x32xf32, #gpu.address_space> + gpu.barrier + %21 = arith.cmpi ult, %7, %c1 : index + scf.if %21 { + %22 = memref.load %alloca_1[%c0, %6] : memref<2x32xf32, #gpu.address_space> + %23 = arith.addf %22, %cst : f32 + %24 = memref.load %alloca_1[%c1, %6] : memref<2x32xf32, #gpu.address_space> + %25 = arith.addf %24, %23 : f32 + memref.store %25, %alloca[%6] : memref<32xf32, #gpu.address_space> + } + gpu.barrier + %subview = memref.subview %alloca[0] [%4] [1] : memref<32xf32, #gpu.address_space> to memref, #gpu.address_space> + %subview_2 = memref.subview %arg1[%5] [%4] [1] : memref<1000xf32> to memref> + memref.copy %subview, %subview_2 : memref, #gpu.address_space> to memref> gpu.return } } - func.func private @Unknown0(memref<4x3x224x224xf32, "cuda">) -> memref<4x3x224x224xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4704 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown1(memref<64x3x7x7xf32, "cuda">) -> memref<64x3x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 74 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown3(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown3", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown4(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> 
attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown5(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown5", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown6(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown6", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown7(memref<128x64x1x1xf32, "cuda">) -> memref<128x64x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 64 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown7", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown8(memref<128x64x3x3xf32, "cuda">) -> memref<128x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown8", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown9(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown9", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown10(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown10", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown11(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown11", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown12(memref<256x128x1x1xf32, "cuda">) -> memref<256x128x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 256 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown12", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown13(memref<256x128x3x3xf32, "cuda">) -> memref<256x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2304 : i32, 
__byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown13", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown14(memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown14", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown15(memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown15", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown16(memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown16", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown17(memref<512x256x1x1xf32, "cuda">) -> memref<512x256x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1024 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown17", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown18(memref<512x256x3x3xf32, "cuda">) -> memref<512x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 9216 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown18", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown19(memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown19", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown20(memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown20", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown21(memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown21", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown22(memref<4x1000xf32, "cuda">) -> memref<4x1000xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = 
"Unknown22", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown23(memref<1000x512xf32, "cuda">) -> memref<1000x512xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4000 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown24(memref<4x64x112x112xf16, "cuda">) -> (memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 25088 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown24", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown26(memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown26", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown28(memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown28", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown30(memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown30", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown32(memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown32", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown35(memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown35", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown37(memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 
4 : i32], __byre__kernel_name = "Unknown37", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown39(memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown39", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown41(memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown41", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown44(memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown44", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown46(memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown48(memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown48", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown50(memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown50", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown53(memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown53", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown55(memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, 
memref<4x512x7x7xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown55", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown57(memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown59(memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown59", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown60(memref<4x512xf16, "cuda">) -> memref<4x512xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 16 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown60", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown61(memref<1000xf32, "cuda">, memref<4x1000xf16, "cuda">) -> memref<4x1000xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown62(memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">) -> (memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown62", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown63(memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">) -> (memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">, memref<4x1000xf32, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown63", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32, 4 : i32, 5 : i32, 6 : i32, 7 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown64(memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) -> memref<4x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [2 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown64", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = 
"PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown68(memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">) -> memref<4x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown68", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown72(memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) -> memref<4x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown72", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown76(memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">) -> memref<4x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown76", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown83(memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) -> memref<4x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown83", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown87(memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">) -> memref<4x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown87", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown91(memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) -> memref<4x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown91", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown95(memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">) -> memref<4x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown95", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown102(memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) -> memref<4x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], 
__byre__kernel_name = "Unknown102", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown106(memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">) -> memref<4x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown106", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown110(memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) -> memref<4x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown110", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown114(memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">) -> memref<4x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown114", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown121(memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) -> memref<4x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown121", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown125(memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown125", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown129(memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) -> memref<4x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown129", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown133(memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown133", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown137(memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> attributes 
{__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown137", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown138(memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">) -> memref<4x64x112x112xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 25088 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown138", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown141(memref) -> memref attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [0 : i32, 0 : i32], __byre__kernel_name = "Unknown141", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown142(memref<64x3x7x7xf16, "cuda">) -> memref<64x3x7x7xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 74 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown142", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown143(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown143", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown144(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown144", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown145(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown145", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown146(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown146", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown147(memref<128x64x3x3xf16, "cuda">) -> memref<128x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown147", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown148(memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 
1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown148", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown149(memref<128x64x1x1xf16, "cuda">) -> memref<128x64x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 64 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown149", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown150(memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown150", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown151(memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown151", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown152(memref<256x128x3x3xf16, "cuda">) -> memref<256x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown152", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown153(memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown153", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown154(memref<256x128x1x1xf16, "cuda">) -> memref<256x128x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 256 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown154", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown155(memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown155", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown156(memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown156", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown157(memref<512x256x3x3xf16, "cuda">) -> memref<512x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 9216 : i32, __byre__arg_ranks = [4 : i32, 4 : 
i32], __byre__kernel_name = "Unknown157", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown158(memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown158", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown159(memref<512x256x1x1xf16, "cuda">) -> memref<512x256x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1024 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown159", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown160(memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown160", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown161(memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown161", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown163(memref<1000x512xf16, "cuda">) -> memref<1000x512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4000 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown163", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown164(memref<1000xf32, "cuda">) -> memref<1000xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown164", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown0(memref<4x3x224x224xf32, "cuda">) -> memref<4x3x224x224xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 588 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown1(memref<64x3x7x7xf32, "cuda">) -> memref<64x3x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 10 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown3(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 36 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown3", __byteir_elementwise_fusion__, arg_offsets = [0 : 
i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown7(memref<128x64x1x1xf32, "cuda">) -> memref<128x64x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown7", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown8(memref<128x64x3x3xf32, "cuda">) -> memref<128x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 72 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown8", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown9(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 144 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown9", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown12(memref<256x128x1x1xf32, "cuda">) -> memref<256x128x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown12", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown13(memref<256x128x3x3xf32, "cuda">) -> memref<256x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown13", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown14(memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown14", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown17(memref<512x256x1x1xf32, "cuda">) -> memref<512x256x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 128 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown17", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown18(memref<512x256x3x3xf32, "cuda">) -> memref<512x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown18", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown19(memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown19", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = 
"cuda"} + func.func private @Unknown22(memref<4x1000xf32, "cuda">) -> memref<4x1000xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown22", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown23(memref<1000x512xf32, "cuda">) -> memref<1000x512xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 500 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown24(memref<1000xf32, "cuda">) -> memref<1000xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown24", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown26(memref<4x64x112x112xf16, "cuda">) -> (memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown26", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown28(memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown28", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown30(memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown30", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown37(memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown37", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown39(memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown39", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown46(memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, 
"cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown48(memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown48", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown55(memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown55", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown57(memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown63(memref<4x512xf16, "cuda">) -> memref<4x512xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown63", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown64(memref<1000xf16, "cuda">, memref<4x1000xf16, "cuda">) -> memref<4x1000xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown64", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown66(memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">) -> memref<4x1000xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown66", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown68(memref<4xf16, "cuda">) -> memref<4xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown68", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown69(memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">) -> (memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 4 : 
i32, __byre__arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown69", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32, 4 : i32, 5 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown70(memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) -> memref<4x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [2 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown70", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown74(memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">) -> memref<4x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown74", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown78(memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) -> memref<4x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown78", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown89(memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) -> memref<4x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown89", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown93(memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">) -> memref<4x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown93", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown108(memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) -> memref<4x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown108", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown112(memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">) -> memref<4x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown112", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown127(memref<4x64x56x56xf16, "cuda">, 
memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) -> memref<4x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown127", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown131(memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown131", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown143(memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown143", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown144(memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">) -> memref<4x64x112x112xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown144", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown148(memref) -> memref attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [0 : i32, 0 : i32], __byre__kernel_name = "Unknown148", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown149(memref<64x3x7x7xf16, "cuda">) -> memref<64x3x7x7xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 10 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown149", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown150(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 36 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown150", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown154(memref<128x64x3x3xf16, "cuda">) -> memref<128x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 72 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown154", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown155(memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 144 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown155", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], 
byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown156(memref<128x64x1x1xf16, "cuda">) -> memref<128x64x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown156", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown159(memref<256x128x3x3xf16, "cuda">) -> memref<256x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown159", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown160(memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown160", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown161(memref<256x128x1x1xf16, "cuda">) -> memref<256x128x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown161", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown164(memref<512x256x3x3xf16, "cuda">) -> memref<512x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown164", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown165(memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown165", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown166(memref<512x256x1x1xf16, "cuda">) -> memref<512x256x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 128 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown166", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown170(memref<1000x512xf16, "cuda">) -> memref<1000x512xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 500 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown170", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown172(memref<1000xf32, "cuda">) -> memref<1000xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown172", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} 
func.func @main(%arg0: memref<4x3x224x224xf32, "cuda">, %arg1: memref<4x1000xf32, "cuda">, %arg2: memref<64x3x7x7xf32, "cuda">, %arg3: memref<64xf32, "cuda">, %arg4: memref<64xf32, "cuda">, %arg5: memref<64xf32, "cuda">, %arg6: memref<64xf32, "cuda">, %arg7: memref<64x64x3x3xf32, "cuda">, %arg8: memref<64xf32, "cuda">, %arg9: memref<64xf32, "cuda">, %arg10: memref<64xf32, "cuda">, %arg11: memref<64xf32, "cuda">, %arg12: memref<64x64x3x3xf32, "cuda">, %arg13: memref<64xf32, "cuda">, %arg14: memref<64xf32, "cuda">, %arg15: memref<64xf32, "cuda">, %arg16: memref<64xf32, "cuda">, %arg17: memref<64x64x3x3xf32, "cuda">, %arg18: memref<64xf32, "cuda">, %arg19: memref<64xf32, "cuda">, %arg20: memref<64xf32, "cuda">, %arg21: memref<64xf32, "cuda">, %arg22: memref<64x64x3x3xf32, "cuda">, %arg23: memref<64xf32, "cuda">, %arg24: memref<64xf32, "cuda">, %arg25: memref<64xf32, "cuda">, %arg26: memref<64xf32, "cuda">, %arg27: memref<128x64x3x3xf32, "cuda">, %arg28: memref<128xf32, "cuda">, %arg29: memref<128xf32, "cuda">, %arg30: memref<128xf32, "cuda">, %arg31: memref<128xf32, "cuda">, %arg32: memref<128x128x3x3xf32, "cuda">, %arg33: memref<128xf32, "cuda">, %arg34: memref<128xf32, "cuda">, %arg35: memref<128xf32, "cuda">, %arg36: memref<128xf32, "cuda">, %arg37: memref<128x64x1x1xf32, "cuda">, %arg38: memref<128xf32, "cuda">, %arg39: memref<128xf32, "cuda">, %arg40: memref<128xf32, "cuda">, %arg41: memref<128xf32, "cuda">, %arg42: memref<128x128x3x3xf32, "cuda">, %arg43: memref<128xf32, "cuda">, %arg44: memref<128xf32, "cuda">, %arg45: memref<128xf32, "cuda">, %arg46: memref<128xf32, "cuda">, %arg47: memref<128x128x3x3xf32, "cuda">, %arg48: memref<128xf32, "cuda">, %arg49: memref<128xf32, "cuda">, %arg50: memref<128xf32, "cuda">, %arg51: memref<128xf32, "cuda">, %arg52: memref<256x128x3x3xf32, "cuda">, %arg53: memref<256xf32, "cuda">, %arg54: memref<256xf32, "cuda">, %arg55: memref<256xf32, "cuda">, %arg56: memref<256xf32, "cuda">, %arg57: memref<256x256x3x3xf32, "cuda">, %arg58: memref<256xf32, "cuda">, %arg59: memref<256xf32, "cuda">, %arg60: memref<256xf32, "cuda">, %arg61: memref<256xf32, "cuda">, %arg62: memref<256x128x1x1xf32, "cuda">, %arg63: memref<256xf32, "cuda">, %arg64: memref<256xf32, "cuda">, %arg65: memref<256xf32, "cuda">, %arg66: memref<256xf32, "cuda">, %arg67: memref<256x256x3x3xf32, "cuda">, %arg68: memref<256xf32, "cuda">, %arg69: memref<256xf32, "cuda">, %arg70: memref<256xf32, "cuda">, %arg71: memref<256xf32, "cuda">, %arg72: memref<256x256x3x3xf32, "cuda">, %arg73: memref<256xf32, "cuda">, %arg74: memref<256xf32, "cuda">, %arg75: memref<256xf32, "cuda">, %arg76: memref<256xf32, "cuda">, %arg77: memref<512x256x3x3xf32, "cuda">, %arg78: memref<512xf32, "cuda">, %arg79: memref<512xf32, "cuda">, %arg80: memref<512xf32, "cuda">, %arg81: memref<512xf32, "cuda">, %arg82: memref<512x512x3x3xf32, "cuda">, %arg83: memref<512xf32, "cuda">, %arg84: memref<512xf32, "cuda">, %arg85: memref<512xf32, "cuda">, %arg86: memref<512xf32, "cuda">, %arg87: memref<512x256x1x1xf32, "cuda">, %arg88: memref<512xf32, "cuda">, %arg89: memref<512xf32, "cuda">, %arg90: memref<512xf32, "cuda">, %arg91: memref<512xf32, "cuda">, %arg92: memref<512x512x3x3xf32, "cuda">, %arg93: memref<512xf32, "cuda">, %arg94: memref<512xf32, "cuda">, %arg95: memref<512xf32, "cuda">, %arg96: memref<512xf32, "cuda">, %arg97: memref<512x512x3x3xf32, "cuda">, %arg98: memref<512xf32, "cuda">, %arg99: memref<512xf32, "cuda">, %arg100: memref<512xf32, "cuda">, %arg101: memref<512xf32, "cuda">, %arg102: memref<1000x512xf32, "cuda">, 
%arg103: memref<1000xf32, "cuda">) -> (memref, memref<64x3x7x7xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<128x64x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x64x1x1xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<256x128x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x128x1x1xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<512x256x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x256x1x1xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1000x512xf32, "cuda">, memref<1000xf32, "cuda">) attributes {__placeholder__byre.entry_point} { %0 = call @Unknown0(%arg0) : (memref<4x3x224x224xf32, "cuda">) -> memref<4x3x224x224xf16, "cuda"> %1 = call @Unknown1(%arg2) : (memref<64x3x7x7xf32, "cuda">) -> memref<64x3x7x7xf16, "cuda"> @@ -4064,344 +2197,354 @@ module @IrToMhlo.2452 attributes {gpu.container_module} { %alloc_0 = memref.alloc() : memref<4x64x112x112xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc, %arg3, %arg4, %alloc_0) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda"> %2 = call @Unknown3(%arg7) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - %3 = call @Unknown4(%arg12) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - %4 = call @Unknown5(%arg17) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - %5 = call @Unknown6(%arg22) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %3 = call @Unknown3(%arg12) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %4 = call @Unknown3(%arg17) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %5 = call @Unknown3(%arg22) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> %6 = call @Unknown7(%arg37) : (memref<128x64x1x1xf32, "cuda">) -> memref<128x64x1x1xf16, "cuda"> %7 = call @Unknown8(%arg27) : (memref<128x64x3x3xf32, "cuda">) -> memref<128x64x3x3xf16, "cuda"> %8 = call @Unknown9(%arg32) : (memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - %9 = call @Unknown10(%arg42) : (memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - %10 = call @Unknown11(%arg47) : 
(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %9 = call @Unknown9(%arg42) : (memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %10 = call @Unknown9(%arg47) : (memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> %11 = call @Unknown12(%arg62) : (memref<256x128x1x1xf32, "cuda">) -> memref<256x128x1x1xf16, "cuda"> %12 = call @Unknown13(%arg52) : (memref<256x128x3x3xf32, "cuda">) -> memref<256x128x3x3xf16, "cuda"> %13 = call @Unknown14(%arg57) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - %14 = call @Unknown15(%arg67) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - %15 = call @Unknown16(%arg72) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + %14 = call @Unknown14(%arg67) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + %15 = call @Unknown14(%arg72) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> %16 = call @Unknown17(%arg87) : (memref<512x256x1x1xf32, "cuda">) -> memref<512x256x1x1xf16, "cuda"> %17 = call @Unknown18(%arg77) : (memref<512x256x3x3xf32, "cuda">) -> memref<512x256x3x3xf16, "cuda"> %18 = call @Unknown19(%arg82) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - %19 = call @Unknown20(%arg92) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - %20 = call @Unknown21(%arg97) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %19 = call @Unknown19(%arg92) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %20 = call @Unknown19(%arg97) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> %21 = call @Unknown22(%arg1) : (memref<4x1000xf32, "cuda">) -> memref<4x1000xf16, "cuda"> %22 = call @Unknown23(%arg102) : (memref<1000x512xf32, "cuda">) -> memref<1000x512xf16, "cuda"> + %23 = call @Unknown24(%arg103) : (memref<1000xf32, "cuda">) -> memref<1000xf16, "cuda"> %alloc_1 = memref.alloc() : memref<4xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%21, %alloc_1) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %23:2 = call @Unknown24(%alloc_0) : (memref<4x64x112x112xf16, "cuda">) -> (memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda">) + byre.compute @PTXOp(%21, %alloc_1) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown25_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %24:2 = call @Unknown26(%alloc_0) : (memref<4x64x112x112xf16, "cuda">) -> (memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda">) %alloc_2 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> - byre.compute @PoolMaxOp_f16_f16(%23#0, %alloc_2) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @PoolMaxOp_f16_f16(%24#0, %alloc_2) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, 
window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_3 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%alloc_2, %2, %alloc_3) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_4 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_3, %arg8, %arg9, %alloc_4) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %24:2 = call @Unknown26(%alloc_4) : (memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) + %25:2 = call @Unknown28(%alloc_4) : (memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) %alloc_5 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%24#0, %3, %alloc_5) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%25#0, %3, %alloc_5) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_6 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_5, %arg13, %arg14, %alloc_6) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %25:2 = call @Unknown28(%alloc_6, %alloc_2) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) + %26:2 = call @Unknown30(%alloc_6, %alloc_2) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) %alloc_7 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%25#0, %4, %alloc_7) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%26#0, %4, %alloc_7) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_8 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_7, %arg18, %arg19, %alloc_8) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %26:2 = call @Unknown30(%alloc_8) : (memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) + %27:2 = call @Unknown28(%alloc_8) : (memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) %alloc_9 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%26#0, %5, %alloc_9) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%27#0, %5, %alloc_9) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_10 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_9, %arg23, %arg24, %alloc_10) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %27:2 = call @Unknown32(%alloc_10, %25#0) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) + %28:2 = call @Unknown30(%alloc_10, %26#0) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) %alloc_11 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%27#0, %6, %alloc_11) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : 
i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%28#0, %6, %alloc_11) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_12 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_11, %arg38, %arg39, %alloc_12) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_13 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%27#0, %7, %alloc_13) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%28#0, %7, %alloc_13) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_14 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_13, %arg28, %arg29, %alloc_14) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %28:2 = call @Unknown35(%alloc_14) : (memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) + %29:2 = call @Unknown37(%alloc_14) : (memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) %alloc_15 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%28#0, %8, %alloc_15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%29#0, %8, %alloc_15) 
{batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_16 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_15, %arg33, %arg34, %alloc_16) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %29:2 = call @Unknown37(%alloc_16, %alloc_12) : (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) + %30:2 = call @Unknown39(%alloc_16, %alloc_12) : (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) %alloc_17 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%29#0, %9, %alloc_17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%30#0, %9, %alloc_17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_18 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_17, %arg43, %arg44, %alloc_18) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %30:2 = call @Unknown39(%alloc_18) : (memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) + %31:2 = call @Unknown37(%alloc_18) : (memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) %alloc_19 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%30#0, %10, %alloc_19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, 
memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%31#0, %10, %alloc_19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_20 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_19, %arg48, %arg49, %alloc_20) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %31:2 = call @Unknown41(%alloc_20, %29#0) : (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) + %32:2 = call @Unknown39(%alloc_20, %30#0) : (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) %alloc_21 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%31#0, %11, %alloc_21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%32#0, %11, %alloc_21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_22 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_21, %arg63, %arg64, %alloc_22) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_23 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%31#0, %12, %alloc_23) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%32#0, %12, %alloc_23) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", 
lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_24 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_23, %arg53, %arg54, %alloc_24) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %32:2 = call @Unknown44(%alloc_24) : (memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) + %33:2 = call @Unknown46(%alloc_24) : (memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) %alloc_25 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%32#0, %13, %alloc_25) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%33#0, %13, %alloc_25) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_26 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_25, %arg58, %arg59, %alloc_26) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %33:2 = call @Unknown46(%alloc_26, %alloc_22) : (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) + %34:2 = call @Unknown48(%alloc_26, %alloc_22) : (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) %alloc_27 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%33#0, %14, %alloc_27) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%34#0, %14, %alloc_27) {batch_group_count = 1 : i64, device = "cuda", 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_28 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_27, %arg68, %arg69, %alloc_28) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %34:2 = call @Unknown48(%alloc_28) : (memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) + %35:2 = call @Unknown46(%alloc_28) : (memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) %alloc_29 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%34#0, %15, %alloc_29) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%35#0, %15, %alloc_29) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_30 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_29, %arg73, %arg74, %alloc_30) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %35:2 = call @Unknown50(%alloc_30, %33#0) : (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) + %36:2 = call @Unknown48(%alloc_30, %34#0) : (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) %alloc_31 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%35#0, %16, %alloc_31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%36#0, %16, 
%alloc_31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_32 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_31, %arg88, %arg89, %alloc_32) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_33 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%35#0, %17, %alloc_33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%36#0, %17, %alloc_33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_34 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_33, %arg78, %arg79, %alloc_34) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %36:2 = call @Unknown53(%alloc_34) : (memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) + %37:2 = call @Unknown55(%alloc_34) : (memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) %alloc_35 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%36#0, %18, %alloc_35) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%37#0, %18, %alloc_35) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides 
= dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_36 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_35, %arg83, %arg84, %alloc_36) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %37:2 = call @Unknown55(%alloc_36, %alloc_32) : (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) + %38:2 = call @Unknown57(%alloc_36, %alloc_32) : (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) %alloc_37 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%37#0, %19, %alloc_37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%38#0, %19, %alloc_37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_38 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_37, %arg93, %arg94, %alloc_38) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %38:2 = call @Unknown57(%alloc_38) : (memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) + %39:2 = call @Unknown55(%alloc_38) : (memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) %alloc_39 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%38#0, %20, %alloc_39) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%39#0, %20, %alloc_39) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, 
window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_40 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_39, %arg98, %arg99, %alloc_40) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %39:2 = call @Unknown59(%alloc_40, %37#0) : (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) - %alloc_41 = memref.alloc() : memref<4x512xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%39#0, %alloc_41) {device = "cuda", dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512xf16, "cuda"> - %40 = call @Unknown60(%alloc_41) : (memref<4x512xf16, "cuda">) -> memref<4x512xf16, "cuda"> + %40:2 = call @Unknown57(%alloc_40, %38#0) : (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) + %collapse_shape = memref.collapse_shape %40#0 [[0, 1], [2, 3]] : memref<4x512x7x7xf16, "cuda"> into memref<2048x49xf16, "cuda"> + %alloc_41 = memref.alloc() : memref<2048xf16, "cuda"> + byre.compute @PTXOp(%collapse_shape, %alloc_41) {BlockSize.x = 64 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 2048 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown62_kernel"} : memref<2048x49xf16, "cuda">, memref<2048xf16, "cuda"> + %expand_shape = memref.expand_shape %alloc_41 [[0, 1]] : memref<2048xf16, "cuda"> into memref<4x512xf16, "cuda"> + %41 = call @Unknown63(%expand_shape) : (memref<4x512xf16, "cuda">) -> memref<4x512xf16, "cuda"> %alloc_42 = memref.alloc() : memref<4x1000xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%40, %22, %alloc_42) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x1000xf16, "cuda"> - %41 = call @Unknown61(%arg103, %alloc_42) : (memref<1000xf32, "cuda">, memref<4x1000xf16, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%41, %22, %alloc_42) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x1000xf16, "cuda"> + %42 = call @Unknown64(%23, %alloc_42) : (memref<1000xf16, "cuda">, memref<4x1000xf16, "cuda">) -> memref<4x1000xf16, "cuda"> %alloc_43 = memref.alloc() : memref<4xf16, "cuda"> - byre.compute @ReduceMaxOp_f16_f16(%41, %alloc_43) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %42:2 = call @Unknown62(%alloc_43, %41) : (memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">) -> (memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">) + byre.compute @PTXOp(%42, %alloc_43) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown65_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %43 = call @Unknown66(%alloc_43, %42) : (memref<4xf16, 
"cuda">, memref<4x1000xf16, "cuda">) -> memref<4x1000xf16, "cuda"> %alloc_44 = memref.alloc() : memref<4xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%42#1, %alloc_44) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %43:3 = call @Unknown63(%alloc_44, %42#0, %alloc_1, %21, %arg1) : (memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">) -> (memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">, memref<4x1000xf32, "cuda">) + byre.compute @PTXOp(%43, %alloc_44) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown67_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %44 = call @Unknown68(%alloc_44) : (memref<4xf16, "cuda">) -> memref<4xf16, "cuda"> + %45:2 = call @Unknown69(%44, %43, %alloc_1, %21) : (memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">) -> (memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">) %alloc_45 = memref.alloc() : memref<4x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%43#0, %22, %alloc_45) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x512xf16, "cuda"> - %44 = call @Unknown64(%alloc_45, %39#1) : (memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%45#1, %22, %alloc_45) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x512xf16, "cuda"> + %46 = call @Unknown70(%alloc_45, %40#1) : (memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) -> memref<4x512x7x7xf16, "cuda"> %alloc_46 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> %alloc_47 = memref.alloc() : memref<512xf32, "cuda"> %alloc_48 = memref.alloc() : memref<512xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_39, %arg98, %44, %alloc_46, %alloc_47, %alloc_48) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_39, %arg98, %46, %alloc_46, %alloc_47, %alloc_48) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> %alloc_49 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_46, %20, %alloc_49) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, 
"cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_50 = memref.alloc() : memref<512x512x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_46, %alloc_50) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - %45 = call @Unknown68(%38#1, %alloc_49) : (memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%39#0, %alloc_46, %alloc_50) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + %47 = call @Unknown74(%39#1, %alloc_49) : (memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">) -> memref<4x512x7x7xf16, "cuda"> %alloc_51 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> %alloc_52 = memref.alloc() : memref<512xf32, "cuda"> %alloc_53 = memref.alloc() : memref<512xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_37, %arg93, %45, %alloc_51, %alloc_52, %alloc_53) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_37, %arg93, %47, %alloc_51, %alloc_52, %alloc_53) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> %alloc_54 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_51, %19, %alloc_54) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_55 = memref.alloc() : memref<512x512x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_51, %alloc_55) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - %46 = call @Unknown72(%44, %alloc_54, %37#1) : (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_51, %alloc_55) 
{batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + %48 = call @Unknown78(%46, %alloc_54, %38#1) : (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) -> memref<4x512x7x7xf16, "cuda"> %alloc_56 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> %alloc_57 = memref.alloc() : memref<512xf32, "cuda"> %alloc_58 = memref.alloc() : memref<512xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_35, %arg83, %46, %alloc_56, %alloc_57, %alloc_58) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_35, %arg83, %48, %alloc_56, %alloc_57, %alloc_58) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> %alloc_59 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_56, %18, %alloc_59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_60 = memref.alloc() : memref<512x512x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_56, %alloc_60) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - %47 = call @Unknown76(%36#1, %alloc_59) : (memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_56, %alloc_60) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + %49 = call @Unknown74(%37#1, %alloc_59) : (memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">) -> memref<4x512x7x7xf16, "cuda"> %alloc_61 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> %alloc_62 = memref.alloc() : memref<512xf32, "cuda"> %alloc_63 = memref.alloc() : memref<512xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_33, %arg78, %47, %alloc_61, 
%alloc_62, %alloc_63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_33, %arg78, %49, %alloc_61, %alloc_62, %alloc_63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> %alloc_64 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_61, %17, %alloc_64) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_65 = memref.alloc() : memref<512x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_61, %alloc_65) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_61, %alloc_65) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> %alloc_66 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> %alloc_67 = memref.alloc() : memref<512xf32, "cuda"> %alloc_68 = memref.alloc() : memref<512xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_31, %arg88, %46, %alloc_66, %alloc_67, %alloc_68) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_31, %arg88, %48, %alloc_66, %alloc_67, %alloc_68) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> %alloc_69 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_66, %16, %alloc_69) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_70 = memref.alloc() : memref<512x256x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_66, %alloc_70) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> - %48 = call @Unknown83(%alloc_69, %alloc_64, %35#1) : (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_66, %alloc_70) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> + %50 = call @Unknown89(%alloc_69, %alloc_64, %36#1) : (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) -> memref<4x256x14x14xf16, "cuda"> %alloc_71 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> %alloc_72 = memref.alloc() : memref<256xf32, "cuda"> %alloc_73 = memref.alloc() : memref<256xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_29, %arg73, %48, %alloc_71, %alloc_72, %alloc_73) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_29, %arg73, %50, %alloc_71, %alloc_72, %alloc_73) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> %alloc_74 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_71, %15, %alloc_74) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_75 = memref.alloc() : memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_71, %alloc_75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, 
memref<256x256x3x3xf16, "cuda"> - %49 = call @Unknown87(%34#1, %alloc_74) : (memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_71, %alloc_75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + %51 = call @Unknown93(%35#1, %alloc_74) : (memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">) -> memref<4x256x14x14xf16, "cuda"> %alloc_76 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> %alloc_77 = memref.alloc() : memref<256xf32, "cuda"> %alloc_78 = memref.alloc() : memref<256xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_27, %arg68, %49, %alloc_76, %alloc_77, %alloc_78) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_27, %arg68, %51, %alloc_76, %alloc_77, %alloc_78) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> %alloc_79 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_76, %14, %alloc_79) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_80 = memref.alloc() : memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_76, %alloc_80) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - %50 = call @Unknown91(%48, %alloc_79, %33#1) : (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_76, %alloc_80) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + %52 = call @Unknown89(%50, %alloc_79, %34#1) : (memref<4x256x14x14xf16, "cuda">, 
memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) -> memref<4x256x14x14xf16, "cuda"> %alloc_81 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> %alloc_82 = memref.alloc() : memref<256xf32, "cuda"> %alloc_83 = memref.alloc() : memref<256xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_25, %arg58, %50, %alloc_81, %alloc_82, %alloc_83) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_25, %arg58, %52, %alloc_81, %alloc_82, %alloc_83) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> %alloc_84 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_81, %13, %alloc_84) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_85 = memref.alloc() : memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_81, %alloc_85) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - %51 = call @Unknown95(%32#1, %alloc_84) : (memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_81, %alloc_85) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + %53 = call @Unknown93(%33#1, %alloc_84) : (memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">) -> memref<4x256x14x14xf16, "cuda"> %alloc_86 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> %alloc_87 = memref.alloc() : memref<256xf32, "cuda"> %alloc_88 = memref.alloc() : memref<256xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_23, %arg53, %51, %alloc_86, %alloc_87, %alloc_88) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_23, 
%arg53, %53, %alloc_86, %alloc_87, %alloc_88) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> %alloc_89 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_86, %12, %alloc_89) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_90 = memref.alloc() : memref<256x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_86, %alloc_90) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_86, %alloc_90) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> %alloc_91 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> %alloc_92 = memref.alloc() : memref<256xf32, "cuda"> %alloc_93 = memref.alloc() : memref<256xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_21, %arg63, %50, %alloc_91, %alloc_92, %alloc_93) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_21, %arg63, %52, %alloc_91, %alloc_92, %alloc_93) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> %alloc_94 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_91, %11, %alloc_94) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_95 = memref.alloc() : memref<256x128x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_91, %alloc_95) {batch_group_count = 1 : i64, device = 
"cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> - %52 = call @Unknown102(%alloc_94, %alloc_89, %31#1) : (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_91, %alloc_95) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> + %54 = call @Unknown108(%alloc_94, %alloc_89, %32#1) : (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) -> memref<4x128x28x28xf16, "cuda"> %alloc_96 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> %alloc_97 = memref.alloc() : memref<128xf32, "cuda"> %alloc_98 = memref.alloc() : memref<128xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_19, %arg48, %52, %alloc_96, %alloc_97, %alloc_98) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_19, %arg48, %54, %alloc_96, %alloc_97, %alloc_98) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> %alloc_99 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_96, %10, %alloc_99) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_100 = memref.alloc() : memref<128x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_96, %alloc_100) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %53 = call @Unknown106(%30#1, %alloc_99) : (memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_96, %alloc_100) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout 
= "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> + %55 = call @Unknown112(%31#1, %alloc_99) : (memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">) -> memref<4x128x28x28xf16, "cuda"> %alloc_101 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> %alloc_102 = memref.alloc() : memref<128xf32, "cuda"> %alloc_103 = memref.alloc() : memref<128xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_17, %arg43, %53, %alloc_101, %alloc_102, %alloc_103) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_17, %arg43, %55, %alloc_101, %alloc_102, %alloc_103) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> %alloc_104 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_101, %9, %alloc_104) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_105 = memref.alloc() : memref<128x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_101, %alloc_105) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %54 = call @Unknown110(%52, %alloc_104, %29#1) : (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_101, %alloc_105) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> + %56 = call @Unknown108(%54, %alloc_104, %30#1) : (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) -> memref<4x128x28x28xf16, "cuda"> %alloc_106 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> %alloc_107 = memref.alloc() : memref<128xf32, "cuda"> %alloc_108 = memref.alloc() : memref<128xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_15, %arg33, %54, %alloc_106, 
%alloc_107, %alloc_108) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_15, %arg33, %56, %alloc_106, %alloc_107, %alloc_108) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> %alloc_109 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_106, %8, %alloc_109) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_110 = memref.alloc() : memref<128x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_106, %alloc_110) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %55 = call @Unknown114(%28#1, %alloc_109) : (memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_106, %alloc_110) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> + %57 = call @Unknown112(%29#1, %alloc_109) : (memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">) -> memref<4x128x28x28xf16, "cuda"> %alloc_111 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> %alloc_112 = memref.alloc() : memref<128xf32, "cuda"> %alloc_113 = memref.alloc() : memref<128xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_13, %arg28, %55, %alloc_111, %alloc_112, %alloc_113) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_13, %arg28, %57, %alloc_111, %alloc_112, %alloc_113) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, 
"cuda">, memref<128xf32, "cuda"> %alloc_114 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_111, %7, %alloc_114) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_115 = memref.alloc() : memref<128x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_111, %alloc_115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_111, %alloc_115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> %alloc_116 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> %alloc_117 = memref.alloc() : memref<128xf32, "cuda"> %alloc_118 = memref.alloc() : memref<128xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_11, %arg38, %54, %alloc_116, %alloc_117, %alloc_118) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_11, %arg38, %56, %alloc_116, %alloc_117, %alloc_118) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> %alloc_119 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_116, %6, %alloc_119) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_120 = memref.alloc() : memref<128x64x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_116, %alloc_120) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, 
"cuda"> - %56 = call @Unknown121(%alloc_119, %alloc_114, %27#1) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_116, %alloc_120) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> + %58 = call @Unknown127(%alloc_119, %alloc_114, %28#1) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) -> memref<4x64x56x56xf16, "cuda"> %alloc_121 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> %alloc_122 = memref.alloc() : memref<64xf32, "cuda"> %alloc_123 = memref.alloc() : memref<64xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_9, %arg23, %56, %alloc_121, %alloc_122, %alloc_123) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_9, %arg23, %58, %alloc_121, %alloc_122, %alloc_123) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> %alloc_124 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_121, %5, %alloc_124) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_125 = memref.alloc() : memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_121, %alloc_125) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - %57 = call @Unknown125(%26#1, %alloc_124) : (memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_121, %alloc_125) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + %59 = call @Unknown131(%27#1, %alloc_124) : (memref<4x64x56x56xi1, "cuda">, 
memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> %alloc_126 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> %alloc_127 = memref.alloc() : memref<64xf32, "cuda"> %alloc_128 = memref.alloc() : memref<64xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_7, %arg18, %57, %alloc_126, %alloc_127, %alloc_128) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_7, %arg18, %59, %alloc_126, %alloc_127, %alloc_128) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> %alloc_129 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_126, %4, %alloc_129) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_130 = memref.alloc() : memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%25#0, %alloc_126, %alloc_130) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - %58 = call @Unknown129(%56, %alloc_129, %25#1) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_126, %alloc_130) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + %60 = call @Unknown127(%58, %alloc_129, %26#1) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) -> memref<4x64x56x56xf16, "cuda"> %alloc_131 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> %alloc_132 = memref.alloc() : memref<64xf32, "cuda"> %alloc_133 = memref.alloc() : memref<64xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_5, %arg13, %58, %alloc_131, %alloc_132, %alloc_133) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute 
@BatchNormGradOp_f16f32f16_f16f32f32(%alloc_5, %arg13, %60, %alloc_131, %alloc_132, %alloc_133) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> %alloc_134 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_131, %3, %alloc_134) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_135 = memref.alloc() : memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%24#0, %alloc_131, %alloc_135) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - %59 = call @Unknown133(%24#1, %alloc_134) : (memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%25#0, %alloc_131, %alloc_135) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + %61 = call @Unknown131(%25#1, %alloc_134) : (memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> %alloc_136 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> %alloc_137 = memref.alloc() : memref<64xf32, "cuda"> %alloc_138 = memref.alloc() : memref<64xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_3, %arg8, %59, %alloc_136, %alloc_137, %alloc_138) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_3, %arg8, %61, %alloc_136, %alloc_137, %alloc_138) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> %alloc_139 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_136, %2, %alloc_139) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, 
window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_140 = memref.alloc() : memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%alloc_2, %alloc_136, %alloc_140) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - %60 = call @Unknown137(%58, %alloc_139) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %62 = call @Unknown143(%60, %alloc_139) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> %alloc_141 = memref.alloc() : memref<4x64x112x112xf16, "cuda"> - byre.compute @PoolMaxGradOp_f16f16_f16(%23#0, %60, %alloc_141) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - %61 = call @Unknown138(%23#1, %alloc_141) : (memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + byre.compute @PoolMaxGradOp_f16f16_f16(%24#0, %62, %alloc_141) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> + %63 = call @Unknown144(%24#1, %alloc_141) : (memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">) -> memref<4x64x112x112xf16, "cuda"> %alloc_142 = memref.alloc() : memref<4x64x112x112xf16, "cuda"> %alloc_143 = memref.alloc() : memref<64xf32, "cuda"> %alloc_144 = memref.alloc() : memref<64xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %61, %alloc_142, %alloc_143, %alloc_144) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %63, %alloc_142, %alloc_143, %alloc_144) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> %alloc_145 = memref.alloc() : memref<64x3x7x7xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_142, %alloc_145) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} 
: memref<4x3x224x224xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> %alloc_146 = memref.alloc() : memref - byre.compute @ReduceSumOp_f32_f32(%43#1, %alloc_146) {device = "cuda", dimensions = dense<[0, 1]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref - %62 = call @Unknown141(%alloc_146) : (memref) -> memref - %63 = call @Unknown142(%alloc_145) : (memref<64x3x7x7xf16, "cuda">) -> memref<64x3x7x7xf32, "cuda"> - %64 = call @Unknown143(%alloc_140) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> - %65 = call @Unknown144(%alloc_135) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> - %66 = call @Unknown145(%alloc_130) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> - %67 = call @Unknown146(%alloc_125) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> - %68 = call @Unknown147(%alloc_115) : (memref<128x64x3x3xf16, "cuda">) -> memref<128x64x3x3xf32, "cuda"> - %69 = call @Unknown148(%alloc_110) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> - %70 = call @Unknown149(%alloc_120) : (memref<128x64x1x1xf16, "cuda">) -> memref<128x64x1x1xf32, "cuda"> - %71 = call @Unknown150(%alloc_105) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> - %72 = call @Unknown151(%alloc_100) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> - %73 = call @Unknown152(%alloc_90) : (memref<256x128x3x3xf16, "cuda">) -> memref<256x128x3x3xf32, "cuda"> - %74 = call @Unknown153(%alloc_85) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> - %75 = call @Unknown154(%alloc_95) : (memref<256x128x1x1xf16, "cuda">) -> memref<256x128x1x1xf32, "cuda"> - %76 = call @Unknown155(%alloc_80) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> - %77 = call @Unknown156(%alloc_75) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> - %78 = call @Unknown157(%alloc_65) : (memref<512x256x3x3xf16, "cuda">) -> memref<512x256x3x3xf32, "cuda"> - %79 = call @Unknown158(%alloc_60) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> - %80 = call @Unknown159(%alloc_70) : (memref<512x256x1x1xf16, "cuda">) -> memref<512x256x1x1xf32, "cuda"> - %81 = call @Unknown160(%alloc_55) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> - %82 = call @Unknown161(%alloc_50) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> - %alloc_147 = memref.alloc() : memref<1000x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%40, %43#0, %alloc_147) {device = "cuda", lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda"> - %83 = call @Unknown163(%alloc_147) : (memref<1000x512xf16, "cuda">) -> memref<1000x512xf32, "cuda"> - %alloc_148 = memref.alloc() : memref<1000xf32, "cuda"> - byre.compute @ReduceSumOp_f32_f32(%43#2, %alloc_148) {device = "cuda", dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<1000xf32, "cuda"> - %84 = call @Unknown164(%alloc_148) : (memref<1000xf32, "cuda">) -> memref<1000xf32, "cuda"> - return %62, %63, %alloc_143, %alloc_144, %64, %alloc_137, %alloc_138, %65, %alloc_132, %alloc_133, %66, %alloc_127, %alloc_128, %67, %alloc_122, %alloc_123, %68, %alloc_112, %alloc_113, %69, %alloc_107, %alloc_108, %70, 
%alloc_117, %alloc_118, %71, %alloc_102, %alloc_103, %72, %alloc_97, %alloc_98, %73, %alloc_87, %alloc_88, %74, %alloc_82, %alloc_83, %75, %alloc_92, %alloc_93, %76, %alloc_77, %alloc_78, %77, %alloc_72, %alloc_73, %78, %alloc_62, %alloc_63, %79, %alloc_57, %alloc_58, %80, %alloc_67, %alloc_68, %81, %alloc_52, %alloc_53, %82, %alloc_47, %alloc_48, %83, %84 : memref, memref<64x3x7x7xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<128x64x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x64x1x1xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<256x128x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x128x1x1xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<512x256x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x256x1x1xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1000x512xf32, "cuda">, memref<1000xf32, "cuda"> + %collapse_shape_147 = memref.collapse_shape %45#0 [[0, 1]] : memref<4x1000xf16, "cuda"> into memref<4000xf16, "cuda"> + %collapse_shape_148 = memref.collapse_shape %arg1 [[0, 1]] : memref<4x1000xf32, "cuda"> into memref<4000xf32, "cuda"> + %expand_shape_149 = memref.expand_shape %collapse_shape_147 [[0, 1]] : memref<4000xf16, "cuda"> into memref<32x125xf16, "cuda"> + %expand_shape_150 = memref.expand_shape %collapse_shape_148 [[0, 1]] : memref<4000xf32, "cuda"> into memref<32x125xf32, "cuda"> + %alloc_151 = memref.alloc() : memref<32xf32, "cuda"> + byre.compute @PTXOp(%expand_shape_149, %expand_shape_150, %alloc_151) {BlockSize.x = 128 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 32 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown147_kernel"} : memref<32x125xf16, "cuda">, memref<32x125xf32, "cuda">, memref<32xf32, "cuda"> + byre.compute @PTXOp(%alloc_151, %alloc_146) {BlockSize.x = 32 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 1 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown147_kernel_0"} : memref<32xf32, "cuda">, memref + %64 = call @Unknown148(%alloc_146) : (memref) -> memref + %65 = call @Unknown149(%alloc_145) : (memref<64x3x7x7xf16, "cuda">) -> memref<64x3x7x7xf32, "cuda"> + %66 = call @Unknown150(%alloc_140) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> + %67 = call @Unknown150(%alloc_135) : 
(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> + %68 = call @Unknown150(%alloc_130) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> + %69 = call @Unknown150(%alloc_125) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> + %70 = call @Unknown154(%alloc_115) : (memref<128x64x3x3xf16, "cuda">) -> memref<128x64x3x3xf32, "cuda"> + %71 = call @Unknown155(%alloc_110) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> + %72 = call @Unknown156(%alloc_120) : (memref<128x64x1x1xf16, "cuda">) -> memref<128x64x1x1xf32, "cuda"> + %73 = call @Unknown155(%alloc_105) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> + %74 = call @Unknown155(%alloc_100) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> + %75 = call @Unknown159(%alloc_90) : (memref<256x128x3x3xf16, "cuda">) -> memref<256x128x3x3xf32, "cuda"> + %76 = call @Unknown160(%alloc_85) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> + %77 = call @Unknown161(%alloc_95) : (memref<256x128x1x1xf16, "cuda">) -> memref<256x128x1x1xf32, "cuda"> + %78 = call @Unknown160(%alloc_80) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> + %79 = call @Unknown160(%alloc_75) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> + %80 = call @Unknown164(%alloc_65) : (memref<512x256x3x3xf16, "cuda">) -> memref<512x256x3x3xf32, "cuda"> + %81 = call @Unknown165(%alloc_60) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> + %82 = call @Unknown166(%alloc_70) : (memref<512x256x1x1xf16, "cuda">) -> memref<512x256x1x1xf32, "cuda"> + %83 = call @Unknown165(%alloc_55) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> + %84 = call @Unknown165(%alloc_50) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> + %alloc_152 = memref.alloc() : memref<1000x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%41, %45#1, %alloc_152) {device = "cuda", lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda"> + %85 = call @Unknown170(%alloc_152) : (memref<1000x512xf16, "cuda">) -> memref<1000x512xf32, "cuda"> + %alloc_153 = memref.alloc() : memref<1000xf32, "cuda"> + byre.compute @PTXOp(%45#1, %alloc_153) {BlockSize.x = 32 : i32, BlockSize.y = 2 : i32, BlockSize.z = 1 : i32, GridSize.x = 32 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown171_kernel"} : memref<4x1000xf16, "cuda">, memref<1000xf32, "cuda"> + %86 = call @Unknown172(%alloc_153) : (memref<1000xf32, "cuda">) -> memref<1000xf32, "cuda"> + return %64, %65, %alloc_143, %alloc_144, %66, %alloc_137, %alloc_138, %67, %alloc_132, %alloc_133, %68, %alloc_127, %alloc_128, %69, %alloc_122, %alloc_123, %70, %alloc_112, %alloc_113, %71, %alloc_107, %alloc_108, %72, %alloc_117, %alloc_118, %73, %alloc_102, %alloc_103, %74, %alloc_97, %alloc_98, %75, %alloc_87, %alloc_88, %76, %alloc_82, %alloc_83, %77, %alloc_92, %alloc_93, %78, %alloc_77, %alloc_78, %79, %alloc_72, %alloc_73, %80, %alloc_62, %alloc_63, %81, %alloc_57, %alloc_58, %82, %alloc_67, %alloc_68, %83, %alloc_52, %alloc_53, %84, %alloc_47, %alloc_48, %85, %86 : memref, memref<64x3x7x7xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, 
"cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<128x64x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x64x1x1xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<256x128x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x128x1x1xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<512x256x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x256x1x1xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1000x512xf32, "cuda">, memref<1000xf32, "cuda"> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/9a_byre_host.mlir b/compiler/test/E2E/ResNet18/Whole/9a_byre_host.mlir index 9840706d4..e320376ae 100644 --- a/compiler/test/E2E/ResNet18/Whole/9a_byre_host.mlir +++ b/compiler/test/E2E/ResNet18/Whole/9a_byre_host.mlir @@ -4,4295 +4,2464 @@ module @IrToMhlo.2452 attributes {byre.container_module, gpu.container_module} { gpu.module @unified { - gpu.func @Unknown164(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { + gpu.func @Unknown172(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<1000xf32> - %7 = arith.truncf %6 : f32 to f16 - %8 = arith.extf %7 : f16 to f32 - memref.store %8, %arg1[%4] : memref<1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg2] : memref<1000xf32> + %8 = arith.truncf %7 : f32 to f16 + %9 = arith.extf %8 : f16 to f32 + memref.store %9, %arg1[%arg2] : memref<1000xf32> } gpu.return } - gpu.func @Unknown163(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown170(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = 
arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf32> } gpu.return } - gpu.func @Unknown161(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { + gpu.func @Unknown166(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { + %c131072 = arith.constant 131072 : index %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> } gpu.return } - gpu.func @Unknown160(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown165(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 
= arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf32> } gpu.return } - gpu.func @Unknown159(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c131072 = arith.constant 131072 : index + gpu.func @Unknown164(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf32> } gpu.return } - 
gpu.func @Unknown158(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { + gpu.func @Unknown161(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { + %c32768 = arith.constant 32768 : index %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> } gpu.return } - gpu.func @Unknown157(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown160(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { + %c589824 = arith.constant 589824 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, 
%c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf32> } gpu.return } - gpu.func @Unknown156(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index + gpu.func @Unknown159(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { + %c294912 = arith.constant 294912 : index + %c128 = arith.constant 128 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = 
arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf32> } gpu.return } - gpu.func @Unknown155(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { + gpu.func @Unknown156(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { + %c8192 = arith.constant 8192 : index %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> } gpu.return } - gpu.func @Unknown154(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c32768 = arith.constant 32768 : index + gpu.func @Unknown155(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { + %c147456 = arith.constant 147456 : index %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = 
arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf32> } gpu.return } - gpu.func @Unknown153(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index + gpu.func @Unknown154(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { + %c73728 = arith.constant 73728 : index + %c64 = arith.constant 64 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf32> } gpu.return } - gpu.func @Unknown152(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index + gpu.func @Unknown150(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { + %c36864 = arith.constant 36864 : index + %c64 = arith.constant 64 : 
index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf32> } gpu.return } - gpu.func @Unknown151(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index + gpu.func @Unknown149(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { + %c9408 = arith.constant 9408 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - 
%26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf32> } gpu.return } - gpu.func @Unknown150(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown148(%arg0: memref, %arg1: memref) kernel { + %c1 = arith.constant 1 : index + %cst = arith.constant 4.000000e+00 : f32 %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1 step %6 { + %7 = memref.load %arg0[] : memref + %8 = arith.negf %7 : f32 + %9 = arith.divf %8, %cst : f32 + memref.store %9, %arg1[] : memref } gpu.return } - gpu.func @Unknown149(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c8192 = arith.constant 8192 : index + gpu.func @Unknown144(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>, %arg2: 
memref<4x64x112x112xf16>) kernel { + %c3211264 = arith.constant 3211264 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c3211264 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x112x112xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x112x112xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x112x112xf16> } gpu.return } - gpu.func @Unknown148(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown143(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - 
memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.addf %13, %14 : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown147(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown131(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown146(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown127(%arg0: 
memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x64x56x56xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown145(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown112(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { + %c401408 = arith.constant 401408 : index + %cst = arith.constant 0.000000e+00 : f16 + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = 
arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xf16> } gpu.return } - gpu.func @Unknown144(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown108(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + %c401408 = arith.constant 401408 : index + %cst = arith.constant 0.000000e+00 : f16 + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select 
%30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x128x28x28xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x128x28x28xf16> } gpu.return } - gpu.func @Unknown143(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown93(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xf16> 
} gpu.return } - gpu.func @Unknown142(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index + gpu.func @Unknown89(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x256x14x14xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x256x14x14xf16> } gpu.return } - gpu.func @Unknown141(%arg0: memref, %arg1: memref) kernel { - %cst = arith.constant 4.000000e+00 : f32 - %c1 = arith.constant 1 : index + gpu.func @Unknown78(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>, %arg3: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 0.000000e+00 : f16 + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1 : index - scf.if %5 { - %6 = memref.load %arg0[] : memref - %7 = arith.negf %6 : f32 - %8 = arith.divf %7, %cst : f32 - memref.store %8, 
%arg1[] : memref + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown138(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xf16>) kernel { + gpu.func @Unknown74(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c3211264 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x112x112xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x112x112xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown137(%arg0: memref<4x64x56x56xf16>, 
%arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown70(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>, %arg2: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 4.900000e+01 : f16 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11] : memref<4x512xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %15 = arith.divf %13, %cst : f16 + %16 = arith.select %14, %15, %cst_0 : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown133(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown69(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf16>, %arg5: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : 
index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg6 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg6, %c1000 : index + %8 = arith.divsi %arg6, %c1000 : index + %9 = memref.load %arg2[%8] : memref<4xf16> + %10 = memref.load %arg0[%8] : memref<4xf16> + %11 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %12 = memref.load %arg3[%8, %7] : memref<4x1000xf16> + %13 = arith.subf %11, %10 : f16 + %14 = math.exp %13 : f16 + %15 = arith.mulf %14, %9 : f16 + %16 = arith.subf %12, %15 : f16 + memref.store %13, %arg4[%8, %7] : memref<4x1000xf16> + memref.store %16, %arg5[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown129(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown68(%arg0: memref<4xf16>, %arg1: memref<4xf16>) kernel { + %c4 = arith.constant 4 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = 
arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c4 step %6 { + %7 = memref.load %arg0[%arg2] : memref<4xf16> + %8 = math.log %7 : f16 + memref.store %8, %arg1[%arg2] : memref<4xf16> } gpu.return } - gpu.func @Unknown125(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown66(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg3, %c1000 : index + %8 = arith.divsi %arg3, %c1000 : index + %9 = memref.load %arg0[%8] : memref<4xf16> + %10 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %11 = arith.subf %10, %9 : f16 + memref.store %11, %arg2[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown121(%arg0: 
memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown64(%arg0: memref<1000xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg3, %c1000 : index + %8 = arith.divsi %arg3, %c1000 : index + %9 = memref.load %arg0[%7] : memref<1000xf16> + %10 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %11 = arith.addf %10, %9 : f16 + memref.store %11, %arg2[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown114(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown63(%arg0: memref<4x512xf16>, %arg1: memref<4x512xf16>) kernel { + %c2048 = arith.constant 2048 : index + %cst = arith.constant 2.040100e-02 : f16 + %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, 
%c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2048 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<4x512xf16> + %10 = arith.mulf %9, %cst : f16 + memref.store %10, %arg1[%8, %7] : memref<4x512xf16> } gpu.return } - gpu.func @Unknown110(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 
: index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x512x7x7xi1> } gpu.return } - gpu.func @Unknown106(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = 
arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xi1> } gpu.return } - gpu.func @Unknown102(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x256x14x14xi1> } gpu.return } - gpu.func @Unknown95(%arg0: 
memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index + gpu.func @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + %cst = arith.constant 0.000000e+00 : f16 %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xi1> } gpu.return } - gpu.func @Unknown91(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { + %c401408 = arith.constant 401408 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - 
%5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x128x28x28xi1> } gpu.return } - gpu.func @Unknown87(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { + %c401408 = arith.constant 401408 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, 
%16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xi1> } gpu.return } - gpu.func @Unknown83(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : 
index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x64x56x56xi1> } gpu.return } - gpu.func @Unknown76(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { + gpu.func @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown72(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>, %arg3: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - 
%c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown68(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : 
index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown64(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>, %arg2: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg0[%35, %29] : memref<4x512xf16> - %38 = arith.divf %37, %cst_0 : f16 - %39 = arith.select %36, %38, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown63(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf32>, %arg5: memref<4x1000xf16>, %arg6: memref<4x1000xf32>, %arg7: memref<4x1000xf32>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg3[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %18 = memref.load %arg0[%15] : memref<4xf16> - %19 = memref.load %arg2[%15] : memref<4xf16> - %20 = 
memref.load %arg4[%15, %9] : memref<4x1000xf32> - %21 = math.log %18 : f16 - %22 = arith.subf %17, %21 : f16 - %23 = math.exp %22 : f16 - %24 = arith.mulf %23, %19 : f16 - %25 = arith.subf %16, %24 : f16 - %26 = arith.extf %22 : f16 to f32 - %27 = arith.mulf %26, %20 : f32 - %28 = arith.extf %25 : f16 to f32 - memref.store %25, %arg5[%15, %9] : memref<4x1000xf16> - memref.store %27, %arg6[%15, %9] : memref<4x1000xf32> - memref.store %28, %arg7[%15, %9] : memref<4x1000xf32> - } - gpu.return - } - gpu.func @Unknown62(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>, %arg3: memref<4x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg0[%15] : memref<4xf16> - %18 = arith.subf %16, %17 : f16 - %19 = math.exp %18 : f16 - memref.store %18, %arg2[%15, %9] : memref<4x1000xf16> - memref.store %19, %arg3[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg0[%9] : memref<1000xf32> - %18 = arith.truncf %17 : f32 to f16 - %19 = arith.addf %16, %18 : f16 - memref.store %19, %arg2[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown60(%arg0: memref<4x512xf16>, %arg1: memref<4x512xf16>) kernel { - %cst = arith.constant 2.040100e-02 : f16 - %c0 = arith.constant 0 : index - %c2048 = arith.constant 2048 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2048 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - 
%15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<4x512xf16> - %17 = arith.mulf %16, %cst : f16 - memref.store %17, %arg1[%15, %9] : memref<4x512xf16> - } - gpu.return - } - gpu.func @Unknown59(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index 
- %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown53(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = 
arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown50(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %40, 
%arg3[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 
= arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown44(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown41(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi 
%c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, 
%arg3: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown35(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = 
arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown32(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xi1> - } - gpu.return - } - gpu.func @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 
= arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - } - gpu.return - } - gpu.func @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : 
index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xi1> } gpu.return } - gpu.func @Unknown26(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { + gpu.func @Unknown26(%arg0: memref<4x64x112x112xf16>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xi1>) kernel { + %c3211264 = arith.constant 3211264 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c3211264 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x112x112xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x64x112x112xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x112x112xi1> } gpu.return } - gpu.func @Unknown24(%arg0: memref<4x64x112x112xf16>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index - %c64 = 
arith.constant 64 : index + gpu.func @Unknown24(%arg0: memref<1000xf32>, %arg1: memref<1000xf16>) kernel { + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c3211264 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x112x112xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x112x112xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x112x112xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg2] : memref<1000xf32> + %8 = arith.truncf %7 : f32 to f16 + memref.store %8, %arg1[%arg2] : memref<1000xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { - %c0 = arith.constant 0 : index %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf16> } gpu.return } gpu.func @Unknown22(%arg0: memref<4x1000xf32>, %arg1: memref<4x1000xf16>) kernel { - %cst = arith.constant -2.500000e-01 : f32 - %c0 = arith.constant 0 : index %c4000 = arith.constant 4000 : index + 
%cst = arith.constant -2.500000e-01 : f32 %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<4x1000xf32> - %17 = arith.mulf %16, %cst : f32 - %18 = arith.truncf %17 : f32 to f16 - memref.store %18, %arg1[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown21(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown20(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - 
%15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg2, %c1000 : index + %8 = arith.divsi %arg2, %c1000 : index + %9 = memref.load %arg0[%8, %7] : memref<4x1000xf32> + %10 = arith.mulf %9, %cst : f32 + %11 = arith.truncf %10 : f32 to f16 + memref.store %11, %arg1[%8, %7] : memref<4x1000xf16> } gpu.return } gpu.func @Unknown19(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : 
index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf16> } gpu.return } gpu.func @Unknown18(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf16> } gpu.return } gpu.func @Unknown17(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : 
memref<512x256x1x1xf16> - } - gpu.return - } - gpu.func @Unknown16(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown15(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, 
%29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> } gpu.return } gpu.func @Unknown14(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf16> } gpu.return } gpu.func @Unknown13(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c294912 = arith.constant 294912 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 
= arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf16> } gpu.return } gpu.func @Unknown12(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - } - gpu.return - } - gpu.func @Unknown11(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 
= arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown10(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> } gpu.return } gpu.func @Unknown9(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, 
%0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf16> } gpu.return } gpu.func @Unknown8(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = 
arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf16> } gpu.return } gpu.func @Unknown7(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> } gpu.return } - gpu.func @Unknown6(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown3(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - 
%28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf16> } gpu.return } - gpu.func @Unknown5(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index + gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { + %c9408 = arith.constant 9408 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf16> } gpu.return } - gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - 
%c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index + gpu.func @Unknown0(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x3x224x224xf16>) kernel { + %c602112 = arith.constant 602112 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c224 = arith.constant 224 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c602112 step %6 { + %7 = arith.remsi %arg2, %c224 : index + %8 = arith.divsi %arg2, %c224 : index + %9 = arith.remsi %8, %c224 : index + %10 = arith.divsi %8, %c224 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x3x224x224xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x3x224x224xf16> } gpu.return } - gpu.func @Unknown3(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index + gpu.func @Unknown25_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into 
memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %32 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %32 : f16 + } else { + scf.yield %cst : f16 + } + %19 = arith.addf %18, %cst : f16 + %20 = arith.cmpi ugt, %16, %c1 : index + %21 = scf.if %20 -> (f16) { + %32 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %32 : f16 + } else { + scf.yield %cst : f16 + } + %22 = arith.addf %19, %21 : f16 + memref.store %22, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %23 = arith.cmpi ult, %1, %c256 : index + scf.if %23 { + %32 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca[%34] : memref<512xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %24 = arith.cmpi ult, %1, %c128 : index + scf.if %24 { + %32 = memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_2[%34] : memref<256xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c64 : index + scf.if %25 { + %32 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_3[%34] : memref<128xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c32 : index + scf.if %26 { + %32 = memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_4[%34] : memref<64xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c16 : index + scf.if %27 { + %32 = 
memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_5[%34] : memref<32xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c8 : index + scf.if %28 { + %32 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_6[%34] : memref<16xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_7[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c4 : index + scf.if %29 { + %32 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_7[%34] : memref<8xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %30 = arith.cmpi ult, %1, %c2 : index + scf.if %30 { + %32 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_8[%34] : memref<4xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %31 = arith.cmpi ult, %1, %c1 : index + scf.if %31 { + %32 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_9[%34] : memref<2xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown62_kernel(%arg0: memref<2048x49xf16>, %arg1: memref<2048xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c49 = arith.constant 49 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = 
arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index + %subview = memref.subview %arg0[%0, 0] [1, 49] [1, 1] : memref<2048x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c64 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c64 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c49 : index + %7 = arith.select %6, %5, %c49 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c49 : index + %10 = arith.select %9, %8, %c49 : index + %11 = arith.subi %10, %7 : index + %subview_0 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13 = scf.if %12 -> (f16) { + %21 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %14 = arith.addf %13, %cst : f16 + memref.store %14, %alloca[%1] : memref<64xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<32xf16, #gpu.address_space> + %15 = arith.cmpi ult, %1, %c32 : index + scf.if %15 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca[%21] : memref<64xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca[%24] : memref<64xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_2[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<16xf16, #gpu.address_space> + %16 = arith.cmpi ult, %1, %c16 : index + scf.if %16 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_2[%21] : memref<32xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_2[%24] : memref<32xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_3[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<8xf16, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c8 : index + scf.if %17 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_3[%21] : memref<16xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_3[%24] : memref<16xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_4[%1] : memref<8xf16, 
#gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<4xf16, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c4 : index + scf.if %18 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_4[%21] : memref<8xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_4[%24] : memref<8xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_5[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<2xf16, #gpu.address_space> + %19 = arith.cmpi ult, %1, %c2 : index + scf.if %19 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_5[%21] : memref<4xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_5[%24] : memref<4xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_6[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %20 = arith.cmpi ult, %1, %c1 : index + scf.if %20 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_6[%21] : memref<2xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_6[%24] : memref<2xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %arg1[%0] : memref<2048xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown65_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf32> - %37 = arith.truncf 
%36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf16> - } - gpu.return - } - gpu.func @Unknown0(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x3x224x224xf16>) kernel { %c0 = arith.constant 0 : index - %c602112 = arith.constant 602112 : index - %c224 = arith.constant 224 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %31 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %31 : f16 + } else { + scf.yield %cst : f16 + } + %19 = arith.cmpi ugt, %16, %c1 : index + %20 = scf.if %19 -> (f16) { + %31 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %31 : f16 + } else { + scf.yield %cst : f16 + } + %21 = arith.maximumf %18, %20 : f16 + memref.store %21, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %22 = arith.cmpi ult, %1, %c256 : index + scf.if %22 { + %31 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca[%32] : memref<512xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %23 = arith.cmpi ult, %1, %c128 : index + scf.if %23 { + %31 = memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_2[%32] : memref<256xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %24 = arith.cmpi ult, %1, %c64 : index + scf.if %24 { + %31 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_3[%32] : memref<128xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c32 : index + scf.if %25 { + %31 
= memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_4[%32] : memref<64xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c16 : index + scf.if %26 { + %31 = memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_5[%32] : memref<32xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c8 : index + scf.if %27 { + %31 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_6[%32] : memref<16xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_7[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c4 : index + scf.if %28 { + %31 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_7[%32] : memref<8xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c2 : index + scf.if %29 { + %31 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_8[%32] : memref<4xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %30 = arith.cmpi ult, %1, %c1 : index + scf.if %30 { + %31 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_9[%32] : memref<2xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown67_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = 
arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %34 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %34 : f16 + } else { + scf.yield %cst : f16 + } + %19 = math.exp %18 : f16 + %20 = arith.addf %19, %cst : f16 + %21 = arith.cmpi ugt, %16, %c1 : index + %22 = scf.if %21 -> (f16) { + %34 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %34 : f16 + } else { + scf.yield %cst : f16 + } + %23 = math.exp %22 : f16 + %24 = arith.addf %20, %23 : f16 + memref.store %24, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c256 : index + scf.if %25 { + %34 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca[%36] : memref<512xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c128 : index + scf.if %26 { + %34 = memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_2[%36] : memref<256xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c64 : index + scf.if %27 { + %34 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_3[%36] : memref<128xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c32 : index + scf.if %28 { + %34 = memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_4[%36] : memref<64xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c16 : index + scf.if %29 { + %34 = memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_5[%36] : 
memref<32xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %30 = arith.cmpi ult, %1, %c8 : index + scf.if %30 { + %34 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_6[%36] : memref<16xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_7[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %31 = arith.cmpi ult, %1, %c4 : index + scf.if %31 { + %34 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_7[%36] : memref<8xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %32 = arith.cmpi ult, %1, %c2 : index + scf.if %32 { + %34 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_8[%36] : memref<4xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %33 = arith.cmpi ult, %1, %c1 : index + scf.if %33 { + %34 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_9[%36] : memref<2xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown147_kernel(%arg0: memref<32x125xf16>, %arg1: memref<32x125xf32>, %arg2: memref<32xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c2 = arith.constant 2 : index + %c64 = arith.constant 64 : index + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c125 = arith.constant 125 : index + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c602112 : index - scf.if %5 { - %6 = arith.remsi %4, %c224 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c224 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c224 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c224 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c224 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c224 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = 
arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x3x224x224xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x3x224x224xf16> - } + %subview = memref.subview %arg0[%0, 0] [1, 125] [1, 1] : memref<32x125xf16> to memref<125xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<125xf16, strided<[1], offset: ?>> into memref<1x125xf16, strided<[125, 1], offset: ?>> + %subview_1 = memref.subview %arg1[%0, 0] [1, 125] [1, 1] : memref<32x125xf32> to memref<125xf32, strided<[1], offset: ?>> + %expand_shape_2 = memref.expand_shape %subview_1 [[0, 1]] : memref<125xf32, strided<[1], offset: ?>> into memref<1x125xf32, strided<[125, 1], offset: ?>> + %alloca = memref.alloca() : memref<128xf32, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c128 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c128 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c125 : index + %7 = arith.select %6, %5, %c125 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c125 : index + %10 = arith.select %9, %8, %c125 : index + %11 = arith.subi %10, %7 : index + %subview_3 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x125xf16, strided<[125, 1], offset: ?>> to memref> + %expand_shape_4 = memref.expand_shape %subview_3 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %subview_5 = memref.subview %expand_shape_2[0, %7] [1, %11] [1, 1] : memref<1x125xf32, strided<[125, 1], offset: ?>> to memref> + %expand_shape_6 = memref.expand_shape %subview_5 [[0, 1]] : memref> into memref<1x?xf32, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13:2 = scf.if %12 -> (f16, f32) { + %24 = memref.load %expand_shape_4[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + %25 = memref.load %expand_shape_6[%c0, %c0] : memref<1x?xf32, strided<[?, 1], offset: ?>> + scf.yield %24, %25 : f16, f32 + } else { + scf.yield %cst_0, %cst : f16, f32 + } + %14 = arith.extf %13#0 : f16 to f32 + %15 = arith.mulf %14, %13#1 : f32 + %16 = arith.addf %15, %cst : f32 + memref.store %16, %alloca[%1] : memref<128xf32, #gpu.address_space> + gpu.barrier + %alloca_7 = memref.alloca() : memref<64xf32, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c64 : index + scf.if %17 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca[%24] : memref<128xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca[%27] : memref<128xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_7[%1] : memref<64xf32, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<32xf32, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c32 : index + scf.if %18 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_7[%24] : memref<64xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_7[%27] : memref<64xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_8[%1] : memref<32xf32, 
#gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<16xf32, #gpu.address_space> + %19 = arith.cmpi ult, %1, %c16 : index + scf.if %19 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_8[%24] : memref<32xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_8[%27] : memref<32xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_9[%1] : memref<16xf32, #gpu.address_space> + } + gpu.barrier + %alloca_10 = memref.alloca() : memref<8xf32, #gpu.address_space> + %20 = arith.cmpi ult, %1, %c8 : index + scf.if %20 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_9[%24] : memref<16xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_9[%27] : memref<16xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_10[%1] : memref<8xf32, #gpu.address_space> + } + gpu.barrier + %alloca_11 = memref.alloca() : memref<4xf32, #gpu.address_space> + %21 = arith.cmpi ult, %1, %c4 : index + scf.if %21 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_10[%24] : memref<8xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_10[%27] : memref<8xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_11[%1] : memref<4xf32, #gpu.address_space> + } + gpu.barrier + %alloca_12 = memref.alloca() : memref<2xf32, #gpu.address_space> + %22 = arith.cmpi ult, %1, %c2 : index + scf.if %22 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_11[%24] : memref<4xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_11[%27] : memref<4xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_12[%1] : memref<2xf32, #gpu.address_space> + } + gpu.barrier + %23 = arith.cmpi ult, %1, %c1 : index + scf.if %23 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_12[%24] : memref<2xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_12[%27] : memref<2xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %arg2[%0] : memref<32xf32> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown147_kernel_0(%arg0: memref<32xf32>, %arg1: memref) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c16 = arith.constant 16 : index + %cst = arith.constant 0.000000e+00 : f32 + %c32 = arith.constant 32 : index + %0 = gpu.block_id x + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %0, %c32 : index + %3 = arith.addi %2, %1 : index + %4 = memref.load %arg0[%3] : memref<32xf32> + %5 = arith.addf %4, %cst : f32 + memref.store %5, %alloca[%1] : memref<32xf32, #gpu.address_space> + gpu.barrier + %alloca_0 = memref.alloca() : memref<16xf32, #gpu.address_space> + %6 = arith.cmpi ult, %1, %c16 : index + scf.if %6 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca[%11] : memref<32xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca[%14] : memref<32xf32, #gpu.address_space> + %16 = 
arith.addf %15, %13 : f32 + memref.store %16, %alloca_0[%1] : memref<16xf32, #gpu.address_space> + } + gpu.barrier + %alloca_1 = memref.alloca() : memref<8xf32, #gpu.address_space> + %7 = arith.cmpi ult, %1, %c8 : index + scf.if %7 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_0[%11] : memref<16xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_0[%14] : memref<16xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_1[%1] : memref<8xf32, #gpu.address_space> + } + gpu.barrier + %alloca_2 = memref.alloca() : memref<4xf32, #gpu.address_space> + %8 = arith.cmpi ult, %1, %c4 : index + scf.if %8 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_1[%11] : memref<8xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_1[%14] : memref<8xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_2[%1] : memref<4xf32, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<2xf32, #gpu.address_space> + %9 = arith.cmpi ult, %1, %c2 : index + scf.if %9 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_2[%11] : memref<4xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_2[%14] : memref<4xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_3[%1] : memref<2xf32, #gpu.address_space> + } + gpu.barrier + %10 = arith.cmpi ult, %1, %c1 : index + scf.if %10 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_3[%11] : memref<2xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_3[%14] : memref<2xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %arg1[] : memref + } + gpu.barrier + gpu.return + } + gpu.func @Unknown171_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<1000xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1000 = arith.constant 1000 : index + %c-32 = arith.constant -32 : index + %0 = gpu.block_id x + %1 = arith.muli %0, %c-32 : index + %2 = arith.addi %1, %c1000 : index + %3 = arith.cmpi slt, %2, %c32 : index + %4 = arith.select %3, %2, %c32 : index + %5 = arith.muli %0, %c32 : index + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %alloca_1 = memref.alloca() : memref<2x32xf32, #gpu.address_space> + %6 = gpu.thread_id x + %7 = gpu.thread_id y + %8 = arith.cmpi slt, %4, %6 : index + %9 = arith.select %8, %4, %6 : index + %10 = arith.addi %6, %c1 : index + %11 = arith.cmpi slt, %4, %10 : index + %12 = arith.select %11, %4, %10 : index + %13 = arith.subi %12, %9 : index + %14 = arith.cmpi ugt, %13, %c0 : index + %15 = scf.if %14 -> (f16) { + %22 = arith.muli %7, %c2 : index + %23 = arith.addi %5, %9 : index + %24 = memref.load %arg0[%22, %23] : memref<4x1000xf16> + scf.yield %24 : f16 + } else { + scf.yield %cst_0 : f16 + } + %16 = arith.extf %15 : f16 to f32 + %17 = arith.addf %16, %cst : f32 + %18 = scf.if %14 -> (f16) { + %22 = arith.muli %7, %c2 : index + %23 = arith.addi %22, %c1 : index + %24 = arith.addi %5, %9 : index + %25 = 
memref.load %arg0[%23, %24] : memref<4x1000xf16> + scf.yield %25 : f16 + } else { + scf.yield %cst_0 : f16 + } + %19 = arith.extf %18 : f16 to f32 + %20 = arith.addf %17, %19 : f32 + memref.store %20, %alloca_1[%7, %6] : memref<2x32xf32, #gpu.address_space> + gpu.barrier + %21 = arith.cmpi ult, %7, %c1 : index + scf.if %21 { + %22 = memref.load %alloca_1[%c0, %6] : memref<2x32xf32, #gpu.address_space> + %23 = arith.addf %22, %cst : f32 + %24 = memref.load %alloca_1[%c1, %6] : memref<2x32xf32, #gpu.address_space> + %25 = arith.addf %24, %23 : f32 + memref.store %25, %alloca[%6] : memref<32xf32, #gpu.address_space> + } + gpu.barrier + %subview = memref.subview %alloca[0] [%4] [1] : memref<32xf32, #gpu.address_space> to memref, #gpu.address_space> + %subview_2 = memref.subview %arg1[%5] [%4] [1] : memref<1000xf32> to memref> + memref.copy %subview, %subview_2 : memref, #gpu.address_space> to memref> gpu.return } } func.func @main(%arg0: memref<4x3x224x224xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<4x1000xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<64xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<64xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<64xf32, "cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<64xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<64xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<64xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<64xf32, "cuda"> {byre.argname = "Input15", byre.argtype = 1 : i32}, %arg16: memref<64xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<64xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<64xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<64xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<64xf32, "cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<64xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<64xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<64xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<64xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<128xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<128xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<128xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, 
%arg31: memref<128xf32, "cuda"> {byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<128xf32, "cuda"> {byre.argname = "Input33", byre.argtype = 1 : i32}, %arg34: memref<128xf32, "cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<128xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<128xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Input37", byre.argtype = 1 : i32}, %arg38: memref<128xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<128xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<128xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<128xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<128xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<128xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<128xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<128xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<128xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<128xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<128xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<128xf32, "cuda"> {byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<256xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<256xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<256xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<256xf32, "cuda"> {byre.argname = "Input56", byre.argtype = 1 : i32}, %arg57: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<256xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<256xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<256xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<256xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<256xf32, "cuda"> {byre.argname = "Input63", byre.argtype = 1 : i32}, %arg64: memref<256xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref<256xf32, "cuda"> {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<256xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref<256xf32, "cuda"> {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<256xf32, "cuda"> {byre.argname = "Input69", byre.argtype = 1 : i32}, %arg70: memref<256xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref<256xf32, "cuda"> {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<256x256x3x3xf32, "cuda"> 
{byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<256xf32, "cuda"> {byre.argname = "Input73", byre.argtype = 1 : i32}, %arg74: memref<256xf32, "cuda"> {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<256xf32, "cuda"> {byre.argname = "Input75", byre.argtype = 1 : i32}, %arg76: memref<256xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<512xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: memref<512xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref<512xf32, "cuda"> {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<512xf32, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref<512xf32, "cuda"> {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<512xf32, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<512xf32, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref<512xf32, "cuda"> {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<512xf32, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref<512xf32, "cuda"> {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<512xf32, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<512xf32, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<512xf32, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<512xf32, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref<512xf32, "cuda"> {byre.argname = "Input95", byre.argtype = 1 : i32}, %arg96: memref<512xf32, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref<512xf32, "cuda"> {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<512xf32, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<512xf32, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref<512xf32, "cuda"> {byre.argname = "Input101", byre.argtype = 1 : i32}, %arg102: memref<1000x512xf32, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<1000xf32, "cuda"> {byre.argname = "Input103", byre.argtype = 1 : i32}, %arg104: memref {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg105: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}, %arg106: memref<64xf32, "cuda"> {byre.argname = "Output2", byre.argtype = 2 : i32}, %arg107: memref<64xf32, "cuda"> {byre.argname = "Output3", byre.argtype = 2 : i32}, %arg108: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output4", byre.argtype = 2 : i32}, %arg109: memref<64xf32, "cuda"> {byre.argname = "Output5", byre.argtype = 2 : i32}, %arg110: memref<64xf32, "cuda"> {byre.argname = "Output6", byre.argtype = 2 : i32}, %arg111: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output7", byre.argtype = 2 : i32}, %arg112: memref<64xf32, "cuda"> {byre.argname = "Output8", byre.argtype = 2 : i32}, %arg113: memref<64xf32, "cuda"> {byre.argname = "Output9", byre.argtype = 2 : i32}, 
%arg114: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output10", byre.argtype = 2 : i32}, %arg115: memref<64xf32, "cuda"> {byre.argname = "Output11", byre.argtype = 2 : i32}, %arg116: memref<64xf32, "cuda"> {byre.argname = "Output12", byre.argtype = 2 : i32}, %arg117: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output13", byre.argtype = 2 : i32}, %arg118: memref<64xf32, "cuda"> {byre.argname = "Output14", byre.argtype = 2 : i32}, %arg119: memref<64xf32, "cuda"> {byre.argname = "Output15", byre.argtype = 2 : i32}, %arg120: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Output16", byre.argtype = 2 : i32}, %arg121: memref<128xf32, "cuda"> {byre.argname = "Output17", byre.argtype = 2 : i32}, %arg122: memref<128xf32, "cuda"> {byre.argname = "Output18", byre.argtype = 2 : i32}, %arg123: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output19", byre.argtype = 2 : i32}, %arg124: memref<128xf32, "cuda"> {byre.argname = "Output20", byre.argtype = 2 : i32}, %arg125: memref<128xf32, "cuda"> {byre.argname = "Output21", byre.argtype = 2 : i32}, %arg126: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Output22", byre.argtype = 2 : i32}, %arg127: memref<128xf32, "cuda"> {byre.argname = "Output23", byre.argtype = 2 : i32}, %arg128: memref<128xf32, "cuda"> {byre.argname = "Output24", byre.argtype = 2 : i32}, %arg129: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output25", byre.argtype = 2 : i32}, %arg130: memref<128xf32, "cuda"> {byre.argname = "Output26", byre.argtype = 2 : i32}, %arg131: memref<128xf32, "cuda"> {byre.argname = "Output27", byre.argtype = 2 : i32}, %arg132: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output28", byre.argtype = 2 : i32}, %arg133: memref<128xf32, "cuda"> {byre.argname = "Output29", byre.argtype = 2 : i32}, %arg134: memref<128xf32, "cuda"> {byre.argname = "Output30", byre.argtype = 2 : i32}, %arg135: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Output31", byre.argtype = 2 : i32}, %arg136: memref<256xf32, "cuda"> {byre.argname = "Output32", byre.argtype = 2 : i32}, %arg137: memref<256xf32, "cuda"> {byre.argname = "Output33", byre.argtype = 2 : i32}, %arg138: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output34", byre.argtype = 2 : i32}, %arg139: memref<256xf32, "cuda"> {byre.argname = "Output35", byre.argtype = 2 : i32}, %arg140: memref<256xf32, "cuda"> {byre.argname = "Output36", byre.argtype = 2 : i32}, %arg141: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Output37", byre.argtype = 2 : i32}, %arg142: memref<256xf32, "cuda"> {byre.argname = "Output38", byre.argtype = 2 : i32}, %arg143: memref<256xf32, "cuda"> {byre.argname = "Output39", byre.argtype = 2 : i32}, %arg144: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output40", byre.argtype = 2 : i32}, %arg145: memref<256xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : i32}, %arg146: memref<256xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg147: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg148: memref<256xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg149: memref<256xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, %arg150: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg151: memref<512xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg152: memref<512xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg153: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : i32}, %arg154: 
memref<512xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg155: memref<512xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg156: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg157: memref<512xf32, "cuda"> {byre.argname = "Output53", byre.argtype = 2 : i32}, %arg158: memref<512xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg159: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg160: memref<512xf32, "cuda"> {byre.argname = "Output56", byre.argtype = 2 : i32}, %arg161: memref<512xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg162: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg163: memref<512xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg164: memref<512xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg165: memref<1000x512xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}, %arg166: memref<1000xf32, "cuda"> {byre.argname = "Output62", byre.argtype = 2 : i32}) attributes {byre.entry_point} { - %alloc = memref.alloc() : memref<76022848xi8, "cuda"> - %0 = "byre.alias"(%alloc) {offset = 8012864 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x3x224x224xf16, "cuda"> - byre.compute @PTXOp(%arg0, %0) {BlockSize.x = 128 : i32, GridSize.x = 4704 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<4x3x224x224xf32, "cuda">, memref<4x3x224x224xf16, "cuda"> - %1 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg2, %1) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> - %2 = "byre.alias"(%alloc) {offset = 50996288 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + %alloc = memref.alloc() : memref<76533504xi8, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 75329280 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x3x224x224xf16, "cuda"> + byre.compute @PTXOp(%arg0, %0) {BlockSize.x = 256 : i32, GridSize.x = 588 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<4x3x224x224xf32, "cuda">, memref<4x3x224x224xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg2, %1) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 49311488 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%0, %1, %2) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<64x3x7x7xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - %3 = "byre.alias"(%alloc) {offset = 44573760 : i64} : (memref<76022848xi8, 
"cuda">) -> memref<4x64x112x112xf16, "cuda"> + %3 = "byre.alias"(%alloc) <{offset = 42888960 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%2, %arg3, %arg4, %3) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda"> - %4 = "byre.alias"(%alloc) {offset = 5080128 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg7, %4) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %5 = "byre.alias"(%alloc) {offset = 5006400 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg12, %5) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %6 = "byre.alias"(%alloc) {offset = 1552384 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg17, %6) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown5", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %7 = "byre.alias"(%alloc) {offset = 5153856 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg22, %7) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %8 = "byre.alias"(%alloc) {offset = 4247104 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg37, %8) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> - %9 = "byre.alias"(%alloc) {offset = 602112 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg27, %9) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown8", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> - %10 = "byre.alias"(%alloc) {offset = 2383872 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg32, %10) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %11 = "byre.alias"(%alloc) {offset = 2088960 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg42, %11) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown10", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %12 = "byre.alias"(%alloc) {offset = 2678784 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg47, %12) 
{BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown11", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %13 = "byre.alias"(%alloc) {offset = 4940864 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg62, %13) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> - %14 = "byre.alias"(%alloc) {offset = 60228672 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg52, %14) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> - %15 = "byre.alias"(%alloc) {offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg57, %15) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %16 = "byre.alias"(%alloc) {offset = 18850880 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg67, %16) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown15", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %17 = "byre.alias"(%alloc) {offset = 6833216 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg72, %17) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %18 = "byre.alias"(%alloc) {offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg87, %18) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown17", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> - %19 = "byre.alias"(%alloc) {offset = 21636160 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg77, %19) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> - %20 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg82, %20) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - %21 = "byre.alias"(%alloc) {offset = 33432640 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg92, %21) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - 
%22 = "byre.alias"(%alloc) {offset = 28714048 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg97, %22) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - %23 = "byre.alias"(%alloc) {offset = 749568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%arg1, %23) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown22", memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda"> - %24 = "byre.alias"(%alloc) {offset = 23995456 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%arg102, %24) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> - %25 = "byre.alias"(%alloc) {offset = 757568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%23, %25) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %26 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> - %27 = "byre.alias"(%alloc) {offset = 59827264 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xi1, "cuda"> - byre.compute @PTXOp(%3, %26, %27) {BlockSize.x = 128 : i32, GridSize.x = 25088 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda"> - %28 = "byre.alias"(%alloc) {offset = 5227584 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @PoolMaxOp_f16_f16(%26, %28) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %29 = "byre.alias"(%alloc) {offset = 20030528 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%28, %4, %29) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %30 = "byre.alias"(%alloc) {offset = 44573760 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%29, %arg8, %arg9, %30) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %31 
= "byre.alias"(%alloc) {offset = 17245248 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %32 = "byre.alias"(%alloc) {offset = 301056 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%30, %31, %32) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %33 = "byre.alias"(%alloc) {offset = 15639616 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%31, %5, %33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %34 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%33, %arg13, %arg14, %34) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %35 = "byre.alias"(%alloc) {offset = 501760 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%34, %28, %30, %35) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown28", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %36 = "byre.alias"(%alloc) {offset = 14033984 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%30, %6, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %37 = "byre.alias"(%alloc) {offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%36, %arg18, %arg19, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %38 = "byre.alias"(%alloc) {offset = 12428352 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %39 = "byre.alias"(%alloc) {offset = 200704 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%37, %38, %39) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : 
i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %40 = "byre.alias"(%alloc) {offset = 9217088 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 5545728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg7, %4) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 5361664 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg12, %5) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 6283008 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg17, %6) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 6209280 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg22, %7) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 5463808 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg37, %8) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 6557440 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg27, %9) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown8", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 2256896 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg32, %10) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 1761280 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg42, %11) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg47, %12) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 5480192 : 
i64}> : (memref<76533504xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg62, %13) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 5619456 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg52, %14) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 74149632 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg57, %15) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg67, %16) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg72, %17) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %18 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg87, %18) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown17", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 23162624 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg77, %19) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> + %20 = "byre.alias"(%alloc) <{offset = 28733184 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg82, %20) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 33451776 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg92, %21) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 38170368 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg97, %22) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = 
"Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 5439616 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%arg1, %23) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown22", memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda"> + %24 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> : (memref<76533504xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%arg102, %24) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 73993984 : i64}> : (memref<76533504xi8, "cuda">) -> memref<1000xf16, "cuda"> + byre.compute @PTXOp(%arg103, %25) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf16, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 5435392 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%23, %26) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown25_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + %28 = "byre.alias"(%alloc) <{offset = 25521920 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xi1, "cuda"> + byre.compute @PTXOp(%3, %27, %28) {BlockSize.x = 256 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 15134464 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @PoolMaxOp_f16_f16(%27, %29) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 16740096 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%29, %4, %30) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 42888960 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%30, %arg8, %arg9, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, 
feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 19951360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 69381888 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%31, %32, %33) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown28", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 7106304 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%32, %5, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%34, %arg13, %arg14, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 18345728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 70987520 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%31, %29, %35, %36) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%35, %6, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 44494592 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%31, %arg18, %arg19, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 13528832 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 57339648 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%37, %38, %39) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, 
arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown28", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 11923200 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%38, %7, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%40, %arg23, %arg24, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %41 = "byre.alias"(%alloc) {offset = 10822720 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %42 = "byre.alias"(%alloc) {offset = 401408 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%37, %30, %41, %42) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown32", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %43 = "byre.alias"(%alloc) {offset = 61621312 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%41, %8, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %44 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 10317568 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%40, %arg23, %arg24, %41) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %42 = "byre.alias"(%alloc) <{offset = 70184704 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%41, %35, %37, %42) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 58142464 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute 
@ConvOp_f16f16_f16(%37, %8, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 10317568 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%43, %arg38, %arg39, %44) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %45 = "byre.alias"(%alloc) {offset = 70452288 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%41, %9, %45) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %46 = "byre.alias"(%alloc) {offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 59748096 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%37, %9, %45) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 46100224 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%45, %arg28, %arg29, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %47 = "byre.alias"(%alloc) {offset = 71255104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %48 = "byre.alias"(%alloc) {offset = 4740160 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %47, %48) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %49 = "byre.alias"(%alloc) {offset = 72057920 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %47 = "byre.alias"(%alloc) <{offset = 60550912 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %48 = 
"byre.alias"(%alloc) <{offset = 3756032 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %47, %48) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %49 = "byre.alias"(%alloc) <{offset = 62156544 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%47, %10, %49) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%49, %arg33, %arg34, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %50 = "byre.alias"(%alloc) {offset = 69649472 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %51 = "byre.alias"(%alloc) {offset = 4790336 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %44, %50, %51) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %52 = "byre.alias"(%alloc) {offset = 60818496 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %50 = "byre.alias"(%alloc) <{offset = 55734016 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %51 = "byre.alias"(%alloc) <{offset = 3354624 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %44, %50, %51) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown39", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %52 = "byre.alias"(%alloc) <{offset = 61353728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%50, %11, %52) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%52, %arg43, %arg44, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, 
memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %53 = "byre.alias"(%alloc) {offset = 57418816 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %54 = "byre.alias"(%alloc) {offset = 4840512 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %53, %54) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown39", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %55 = "byre.alias"(%alloc) {offset = 68846656 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %53 = "byre.alias"(%alloc) <{offset = 58945280 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %54 = "byre.alias"(%alloc) <{offset = 2953216 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %53, %54) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %55 = "byre.alias"(%alloc) <{offset = 56536832 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%53, %12, %55) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%55, %arg48, %arg49, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %56 = "byre.alias"(%alloc) {offset = 72860736 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %57 = "byre.alias"(%alloc) {offset = 4890688 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %50, %56, %57) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown41", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %58 = "byre.alias"(%alloc) {offset = 2973696 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%56, %13, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %59 = "byre.alias"(%alloc) {offset = 46179392 : i64} : (memref<76022848xi8, 
"cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%58, %arg63, %arg64, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %60 = "byre.alias"(%alloc) {offset = 59024448 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%56, %14, %60) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %61 = "byre.alias"(%alloc) {offset = 46580800 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%60, %arg53, %arg54, %61) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %62 = "byre.alias"(%alloc) {offset = 58623040 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %63 = "byre.alias"(%alloc) {offset = 4263488 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%61, %62, %63) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %64 = "byre.alias"(%alloc) {offset = 58221632 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%62, %15, %64) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%64, %arg58, %arg59, %61) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %65 = "byre.alias"(%alloc) {offset = 4338752 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %66 = "byre.alias"(%alloc) {offset = 4288576 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%61, %59, %65, %66) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %67 = 
"byre.alias"(%alloc) {offset = 3776512 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%65, %16, %67) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%67, %arg68, %arg69, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %68 = "byre.alias"(%alloc) {offset = 3375104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %69 = "byre.alias"(%alloc) {offset = 4313664 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%59, %68, %69) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %70 = "byre.alias"(%alloc) {offset = 74843200 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%68, %17, %70) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%70, %arg73, %arg74, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %71 = "byre.alias"(%alloc) {offset = 75244608 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %72 = "byre.alias"(%alloc) {offset = 4177920 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%59, %65, %71, %72) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown50", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %73 = "byre.alias"(%alloc) {offset = 950272 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%71, %18, %73) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : 
tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %74 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%73, %arg88, %arg89, %74) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %75 = "byre.alias"(%alloc) {offset = 1150976 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%71, %19, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %76 = "byre.alias"(%alloc) {offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%75, %arg78, %arg79, %76) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %77 = "byre.alias"(%alloc) {offset = 1351680 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %78 = "byre.alias"(%alloc) {offset = 59688000 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%76, %77, %78) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown53", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %79 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%77, %20, %79) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%79, %arg83, %arg84, %76) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %80 = "byre.alias"(%alloc) {offset = 1626112 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %81 = "byre.alias"(%alloc) {offset = 59700544 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%76, %74, %80, %81) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown55", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : 
memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - byre.compute @ConvOp_f16f16_f16(%80, %21, %76) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %82 = "byre.alias"(%alloc) {offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%76, %arg93, %arg94, %82) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %83 = "byre.alias"(%alloc) {offset = 1826816 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %84 = "byre.alias"(%alloc) {offset = 59713088 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%82, %83, %84) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %85 = "byre.alias"(%alloc) {offset = 75646016 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%83, %22, %85) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%85, %arg98, %arg99, %74) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %86 = "byre.alias"(%alloc) {offset = 75846720 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%74, %80, %82, %86) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown59", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %87 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%82, %87) {device = "cuda", dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512xf16, "cuda"> - %88 = "byre.alias"(%alloc) {offset = 4203008 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @PTXOp(%87, %88) {BlockSize.x = 128 : i32, GridSize.x = 16 : i32, arg_ranks = [2 : i32, 2 : i32], 
kernel_name = "Unknown60", memory_effects = [1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512xf16, "cuda"> - %89 = "byre.alias"(%alloc) {offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%88, %24, %89) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x1000xf16, "cuda"> - %90 = "byre.alias"(%alloc) {offset = 25019456 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%arg103, %89, %90) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> - %91 = "byre.alias"(%alloc) {offset = 25027456 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceMaxOp_f16_f16(%90, %91) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %92 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%91, %90, %92, %89) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> - %93 = "byre.alias"(%alloc) {offset = 42877824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%89, %93) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %94 = "byre.alias"(%alloc) {offset = 4207104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - %95 = "byre.alias"(%alloc) {offset = 4215104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf32, "cuda"> - %96 = "byre.alias"(%alloc) {offset = 4231104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf32, "cuda"> - byre.compute @PTXOp(%93, %92, %25, %23, %arg1, %94, %95, %96) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown63", memory_effects = [1 : i32, 1 : i32, 1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">, memref<4x1000xf32, "cuda"> - %97 = "byre.alias"(%alloc) {offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%94, %24, %97) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x512xf16, "cuda"> - %98 = "byre.alias"(%alloc) {offset = 749568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%97, %86, %98) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : 
i32]} : memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%85, %arg98, %98, %82, %arg163, %arg164) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%82, %22, %74) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%83, %82, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%84, %74, %82) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown68", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%76, %arg93, %82, %74, %arg160, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%74, %21, %76) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %99 = "byre.alias"(%alloc) {offset = 23995456 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%80, %74, %99) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%98, %76, %81, %83) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%79, %arg83, %83, %76, %arg154, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, 
feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%76, %20, %85) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%77, %76, %21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%78, %85, %76) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown76", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %100 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%75, %arg78, %76, %100, %arg151, %arg152) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%100, %19, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%71, %100, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%73, %arg88, %83, %100, %arg157, %arg158) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%100, %18, %61) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> 
: tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %101 = "byre.alias"(%alloc) {offset = 1826816 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%71, %100, %101) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> - %102 = "byre.alias"(%alloc) {offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%61, %59, %72, %102) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown83", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%70, %arg73, %102, %59, %arg148, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %103 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%59, %17, %103) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %104 = "byre.alias"(%alloc) {offset = 74843200 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%68, %59, %104) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%69, %103, %59) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown87", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%67, %arg68, %59, %103, %arg145, %arg146) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %16, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, 
input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%65, %103, %16) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%102, %59, %66, %65) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown91", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%64, %arg58, %65, %59, %arg139, %arg140) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%59, %15, %103) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%62, %59, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%63, %103, %59) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%60, %arg53, %59, %103, %arg136, %arg137) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %14, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute 
@ConvBackwardFilterOp_f16f16_f16(%56, %103, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%58, %arg63, %65, %103, %arg142, %arg143) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %105 = "byre.alias"(%alloc) {offset = 46982208 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %13, %105) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %106 = "byre.alias"(%alloc) {offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%56, %103, %106) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> - byre.compute @PTXOp(%105, %46, %57, %56) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown102", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%55, %arg48, %56, %46, %arg133, %arg134) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %107 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %56 = "byre.alias"(%alloc) <{offset = 4960256 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %50, %44, %56) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown39", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %57 = "byre.alias"(%alloc) <{offset = 11120384 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%44, %13, %57) {batch_group_count = 1 : i64, device = 
"cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %58 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%57, %arg63, %arg64, %58) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %59 = "byre.alias"(%alloc) <{offset = 46100224 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%44, %14, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %60 = "byre.alias"(%alloc) <{offset = 46501632 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%59, %arg53, %arg54, %60) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %61 = "byre.alias"(%alloc) <{offset = 2551808 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %62 = "byre.alias"(%alloc) <{offset = 6704896 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%60, %61, %62) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%61, %15, %60) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %63 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%60, %arg58, %arg59, %63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %64 = "byre.alias"(%alloc) <{offset = 4157440 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, 
"cuda"> + %65 = "byre.alias"(%alloc) <{offset = 6905600 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %58, %64, %65) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %66 = "byre.alias"(%alloc) <{offset = 22736640 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%64, %16, %66) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%66, %arg68, %arg69, %63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %67 = "byre.alias"(%alloc) <{offset = 71790336 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %68 = "byre.alias"(%alloc) <{offset = 2056192 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %67, %68) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %69 = "byre.alias"(%alloc) <{offset = 9891584 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%67, %17, %69) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%69, %arg73, %arg74, %63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %70 = "byre.alias"(%alloc) <{offset = 72191744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %71 = "byre.alias"(%alloc) <{offset = 294912 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %64, %70, %71) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %72 = 
"byre.alias"(%alloc) <{offset = 495616 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%70, %18, %72) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %73 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%72, %arg88, %arg89, %73) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %74 = "byre.alias"(%alloc) <{offset = 696320 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%70, %19, %74) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %75 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%74, %arg78, %arg79, %75) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %76 = "byre.alias"(%alloc) <{offset = 897024 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %77 = "byre.alias"(%alloc) <{offset = 6457088 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%75, %76, %77) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%76, %20, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %78 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%75, %arg83, %arg84, %78) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, 
memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %79 = "byre.alias"(%alloc) <{offset = 1097728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %80 = "byre.alias"(%alloc) <{offset = 4820992 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%78, %73, %79, %80) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %81 = "byre.alias"(%alloc) <{offset = 1298432 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%79, %21, %81) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%81, %arg93, %arg94, %78) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %82 = "byre.alias"(%alloc) <{offset = 1499136 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %83 = "byre.alias"(%alloc) <{offset = 6356736 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%78, %82, %83) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %84 = "byre.alias"(%alloc) <{offset = 72593152 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%82, %22, %84) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%84, %arg98, %arg99, %73) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %85 = "byre.alias"(%alloc) <{offset = 72793856 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%73, %79, %78, %85) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %86 = 
"byre.alias"(%alloc) <{offset = 47103744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<2048x49xf16, "cuda"> + %87 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<2048xf16, "cuda"> + byre.compute @PTXOp(%86, %87) {BlockSize.x = 64 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 2048 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown62_kernel"} : memref<2048x49xf16, "cuda">, memref<2048xf16, "cuda"> + %88 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + %89 = "byre.alias"(%alloc) <{offset = 5435520 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + byre.compute @PTXOp(%88, %89) {BlockSize.x = 256 : i32, GridSize.x = 2 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown63", memory_effects = [1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512xf16, "cuda"> + %90 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%89, %24, %90) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x1000xf16, "cuda"> + %91 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%25, %90, %91) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %92 = "byre.alias"(%alloc) <{offset = 11529856 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%91, %92) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown65_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + byre.compute @PTXOp(%92, %91, %90) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown66", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %93 = "byre.alias"(%alloc) <{offset = 47111808 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%90, %93) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown67_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %94 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%93, %94) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown68", memory_effects = [1 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4xf16, "cuda"> + %95 = "byre.alias"(%alloc) <{offset = 5447680 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + %96 = "byre.alias"(%alloc) <{offset = 5455744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%94, %90, %26, %23, %95, %96) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 
: i32], kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %97 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%96, %24, %97) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x512xf16, "cuda"> + %98 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%97, %85, %98) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown70", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%84, %arg98, %98, %78, %arg163, %arg164) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%78, %22, %73) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%82, %78, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%83, %73, %78) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%81, %arg93, %78, %73, %arg160, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%73, %21, %78) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%79, %73, %21) 
{batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%98, %78, %80, %84) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown78", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%75, %arg83, %84, %73, %arg154, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%73, %20, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%76, %73, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%77, %75, %73) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%74, %arg78, %73, %75, %arg151, %arg152) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%75, %19, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%70, %75, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, 
"cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%72, %arg88, %84, %98, %arg157, %arg158) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%98, %18, %63) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %99 = "byre.alias"(%alloc) <{offset = 1499136 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%70, %98, %99) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> + %100 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%63, %58, %71, %100) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%69, %arg73, %100, %63, %arg148, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%63, %17, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %101 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%67, %63, %101) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%68, %58, %63) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown93", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, 
memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%66, %arg68, %63, %58, %arg145, %arg146) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%58, %16, %63) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %102 = "byre.alias"(%alloc) <{offset = 71790336 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%64, %58, %102) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + %103 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%100, %63, %65, %103) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> + %104 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%60, %arg58, %103, %104, %arg139, %arg140) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%104, %15, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%61, %104, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%62, %58, %60) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown93", memory_effects = [1 : 
i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%59, %arg53, %60, %58, %arg136, %arg137) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%58, %14, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%44, %58, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%57, %arg63, %103, %104, %arg142, %arg143) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %105 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%104, %13, %105) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %106 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%44, %104, %106) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%105, %46, %56, %44) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown108", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%55, %arg48, %44, %46, %arg133, %arg134) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : 
memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + %107 = "byre.alias"(%alloc) <{offset = 11120384 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %12, %107) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %108 = "byre.alias"(%alloc) {offset = 68846656 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %108 = "byre.alias"(%alloc) <{offset = 56536832 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%53, %46, %108) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %109 = "byre.alias"(%alloc) {offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%54, %107, %109) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown106", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%52, %arg43, %109, %46, %arg130, %arg131) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %11, %107) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %110 = "byre.alias"(%alloc) {offset = 69141568 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%50, %46, %110) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%56, %107, %51, %109) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown110", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, 
memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%49, %arg33, %109, %46, %arg124, %arg125) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %10, %107) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %111 = "byre.alias"(%alloc) {offset = 72860736 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%54, %107, %46) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown112", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%52, %arg43, %46, %107, %arg130, %arg131) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %11, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %109 = "byre.alias"(%alloc) <{offset = 61353728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%50, %107, %109) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> + %110 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%44, %46, %51, %110) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown108", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%49, %arg33, %110, %46, %arg124, %arg125) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, 
memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %10, %44) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %111 = "byre.alias"(%alloc) <{offset = 55734016 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%47, %46, %111) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%48, %107, %46) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown114", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%45, %arg28, %46, %107, %arg121, %arg122) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %9, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %112 = "byre.alias"(%alloc) {offset = 73155648 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%41, %107, %112) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%43, %arg38, %109, %107, %arg127, %arg128) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %113 = "byre.alias"(%alloc) {offset = 47785024 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %8, %113) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", 
memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %114 = "byre.alias"(%alloc) {offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%41, %107, %114) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> - byre.compute @PTXOp(%113, %37, %42, %41) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown121", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> - %115 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%40, %arg23, %41, %115, %arg118, %arg119) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%115, %7, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %116 = "byre.alias"(%alloc) {offset = 9217088 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%38, %115, %116) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%39, %37, %115) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown125", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%36, %arg18, %115, %37, %arg115, %arg116) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%37, %6, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %117 = "byre.alias"(%alloc) {offset = 12428352 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%30, %37, %117) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%41, %115, %35, %36) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown129", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%33, %arg13, %36, %30, %arg112, %arg113) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%30, %5, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %118 = "byre.alias"(%alloc) {offset = 15639616 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31, %30, %118) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%32, %115, %30) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown133", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%29, %arg8, %30, %115, %arg109, %arg110) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%115, %4, %30) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = 
dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %119 = "byre.alias"(%alloc) {offset = 20030528 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%28, %115, %119) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%36, %30, %115) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown137", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @PoolMaxGradOp_f16f16_f16(%26, %115, %3) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - %120 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> - byre.compute @PTXOp(%27, %3, %120) {BlockSize.x = 128 : i32, GridSize.x = 25088 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown138", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%2, %arg3, %120, %26, %arg106, %arg107) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %121 = "byre.alias"(%alloc) {offset = 50996288 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %26, %121) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> - %122 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref - byre.compute @ReduceSumOp_f32_f32(%95, %122) {device = "cuda", dimensions = dense<[0, 1]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref - byre.compute @PTXOp(%122, %arg104) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [0 : i32, 0 : i32], kernel_name = "Unknown141", memory_effects = [1 : i32, 2 : i32]} : memref, memref - byre.compute @PTXOp(%121, %arg105) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown142", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> - 
byre.compute @PTXOp(%119, %arg108) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown143", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%118, %arg111) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown144", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%117, %arg114) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown145", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%116, %arg117) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown146", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%112, %arg120) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown147", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%111, %arg123) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown148", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%114, %arg126) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown149", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> - byre.compute @PTXOp(%110, %arg129) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%108, %arg132) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown151", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%14, %arg135) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown152", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%17, %arg138) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown153", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%106, %arg141) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown154", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> - byre.compute @PTXOp(%16, %arg144) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%104, %arg147) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown156", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%19, %arg150) {BlockSize.x = 128 : 
i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown157", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%21, %arg153) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown158", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%101, %arg156) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown159", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> - byre.compute @PTXOp(%99, %arg159) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%22, %arg162) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown161", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - %123 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%88, %94, %123) {device = "cuda", lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%123, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown163", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> - %124 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000xf32, "cuda"> - byre.compute @ReduceSumOp_f32_f32(%96, %124) {device = "cuda", dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<1000xf32, "cuda"> - byre.compute @PTXOp(%124, %arg166) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown164", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda"> + byre.compute @PTXOp(%48, %44, %46) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown112", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %112 = "byre.alias"(%alloc) <{offset = 9514752 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%45, %arg28, %46, %112, %arg121, %arg122) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%112, %9, %41) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, 
window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %113 = "byre.alias"(%alloc) <{offset = 56028928 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37, %112, %113) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%43, %arg38, %110, %46, %arg127, %arg128) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + %114 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %8, %114) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %115 = "byre.alias"(%alloc) <{offset = 62156544 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37, %46, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> + %116 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%114, %41, %42, %116) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown127", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%40, %arg23, %116, %37, %arg118, %arg119) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%37, %7, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + 
%117 = "byre.alias"(%alloc) <{offset = 10317568 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%38, %37, %117) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%39, %40, %37) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown131", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%31, %arg18, %37, %38, %arg115, %arg116) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%38, %6, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %118 = "byre.alias"(%alloc) <{offset = 11923200 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%35, %38, %118) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%116, %31, %36, %35) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown127", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%34, %arg13, %35, %31, %arg112, %arg113) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%31, %5, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %119 = "byre.alias"(%alloc) <{offset = 13528832 : i64}> : (memref<76533504xi8, "cuda">) -> 
memref<64x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%32, %31, %119) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%33, %34, %31) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown131", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%30, %arg8, %31, %34, %arg109, %arg110) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%34, %4, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %120 = "byre.alias"(%alloc) <{offset = 19951360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29, %34, %120) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%35, %31, %34) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown143", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @PoolMaxGradOp_f16f16_f16(%27, %34, %3) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%28, %3, %27) {BlockSize.x = 256 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown144", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%2, %arg3, %27, %3, %arg106, %arg107) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, 
memref<64xf32, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %3, %1) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> + %121 = "byre.alias"(%alloc) <{offset = 62978176 : i64}> : (memref<76533504xi8, "cuda">) -> memref + %122 = "byre.alias"(%alloc) <{offset = 5447680 : i64}> : (memref<76533504xi8, "cuda">) -> memref<32x125xf16, "cuda"> + %123 = "byre.alias"(%arg1) <{offset = 0 : i64}> : (memref<4x1000xf32, "cuda">) -> memref<32x125xf32, "cuda"> + %124 = "byre.alias"(%alloc) <{offset = 49311488 : i64}> : (memref<76533504xi8, "cuda">) -> memref<32xf32, "cuda"> + byre.compute @PTXOp(%122, %123, %124) {BlockSize.x = 128 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 32 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown147_kernel"} : memref<32x125xf16, "cuda">, memref<32x125xf32, "cuda">, memref<32xf32, "cuda"> + byre.compute @PTXOp(%124, %121) {BlockSize.x = 32 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 1 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown147_kernel_0"} : memref<32xf32, "cuda">, memref + byre.compute @PTXOp(%121, %arg104) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [0 : i32, 0 : i32], kernel_name = "Unknown148", memory_effects = [1 : i32, 2 : i32]} : memref, memref + byre.compute @PTXOp(%1, %arg105) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown149", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> + byre.compute @PTXOp(%120, %arg108) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%119, %arg111) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%118, %arg114) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%117, %arg117) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%113, %arg120) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown154", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%111, %arg123) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%115, %arg126) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown156", memory_effects 
= [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> + byre.compute @PTXOp(%109, %arg129) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%108, %arg132) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%14, %arg135) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown159", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%15, %arg138) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%106, %arg141) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown161", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> + byre.compute @PTXOp(%102, %arg144) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%101, %arg147) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%19, %arg150) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown164", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%20, %arg153) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%99, %arg156) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown166", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> + byre.compute @PTXOp(%21, %arg159) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%22, %arg162) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + %125 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%89, %96, %125) {device = "cuda", lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%125, %arg165) 
{BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown170", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> + %126 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<1000xf32, "cuda"> + byre.compute @PTXOp(%96, %126) {BlockSize.x = 32 : i32, BlockSize.y = 2 : i32, BlockSize.z = 1 : i32, GridSize.x = 32 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown171_kernel"} : memref<4x1000xf16, "cuda">, memref<1000xf32, "cuda"> + byre.compute @PTXOp(%126, %arg166) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown172", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda"> return } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/9b_nvvm_codegen.mlir b/compiler/test/E2E/ResNet18/Whole/9b_nvvm_codegen.mlir index 1cd7dd3b4..1e4d27090 100644 --- a/compiler/test/E2E/ResNet18/Whole/9b_nvvm_codegen.mlir +++ b/compiler/test/E2E/ResNet18/Whole/9b_nvvm_codegen.mlir @@ -1,4298 +1,2464 @@ -// RUN: byteir-opt %s -nvvm-codegen | FileCheck %s - -// CHECK-LABEL: gpu.module @unified - module @IrToMhlo.2452 attributes {byre.container_module, gpu.container_module} { gpu.module @unified { - gpu.func @Unknown164(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { + gpu.func @Unknown172(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<1000xf32> - %7 = arith.truncf %6 : f32 to f16 - %8 = arith.extf %7 : f16 to f32 - memref.store %8, %arg1[%4] : memref<1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg2] : memref<1000xf32> + %8 = arith.truncf %7 : f32 to f16 + %9 = arith.extf %8 : f16 to f32 + memref.store %9, %arg1[%arg2] : memref<1000xf32> } gpu.return } - gpu.func @Unknown163(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown170(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7] : 
memref<1000x512xf32> } gpu.return } - gpu.func @Unknown161(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { + gpu.func @Unknown166(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { + %c131072 = arith.constant 131072 : index %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> } gpu.return } - gpu.func @Unknown160(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown165(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 
: index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf32> } gpu.return } - gpu.func @Unknown159(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c131072 = arith.constant 131072 : index + gpu.func @Unknown164(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf32> } gpu.return } - gpu.func @Unknown158(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { + gpu.func @Unknown161(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { + %c32768 = arith.constant 32768 : index %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - 
%5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> } gpu.return } - gpu.func @Unknown157(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown160(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { + %c589824 = arith.constant 589824 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = 
arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf32> } gpu.return } - gpu.func @Unknown156(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index + gpu.func @Unknown159(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { + %c294912 = arith.constant 294912 : index + %c128 = arith.constant 128 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf32> } gpu.return } - gpu.func @Unknown155(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { + gpu.func @Unknown156(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { + %c8192 = arith.constant 8192 : index %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = 
arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> } gpu.return } - gpu.func @Unknown154(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c32768 = arith.constant 32768 : index + gpu.func @Unknown155(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { + %c147456 = arith.constant 147456 : index %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, 
%7] : memref<128x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf32> } gpu.return } - gpu.func @Unknown153(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index + gpu.func @Unknown154(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { + %c73728 = arith.constant 73728 : index + %c64 = arith.constant 64 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf32> } gpu.return } - gpu.func @Unknown152(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index + gpu.func @Unknown150(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { + %c36864 = arith.constant 36864 : index + %c64 = arith.constant 64 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi 
%12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf32> } gpu.return } - gpu.func @Unknown151(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index + gpu.func @Unknown149(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { + %c9408 = arith.constant 9408 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + 
%5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf32> } gpu.return } - gpu.func @Unknown150(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown148(%arg0: memref, %arg1: memref) kernel { + %c1 = arith.constant 1 : index + %cst = arith.constant 4.000000e+00 : f32 %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1 step %6 { + %7 = memref.load %arg0[] : memref + %8 = arith.negf %7 : f32 + %9 = arith.divf %8, %cst : f32 + memref.store %9, %arg1[] : memref } gpu.return } - gpu.func @Unknown149(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c8192 = arith.constant 8192 : index + gpu.func @Unknown144(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xf16>) kernel { + %c3211264 = arith.constant 3211264 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - 
%11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c3211264 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x112x112xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x112x112xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x112x112xf16> } gpu.return } - gpu.func @Unknown148(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown143(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.addf %13, %14 : f16 + memref.store %15, %arg2[%12, %11, 
%9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown147(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown131(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown146(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown127(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - 
%10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x64x56x56xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown145(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown112(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { + %c401408 = arith.constant 401408 : index + %cst = arith.constant 0.000000e+00 : f16 + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 
: index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xf16> } gpu.return } - gpu.func @Unknown144(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown108(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + %c401408 = arith.constant 401408 : index + %cst = arith.constant 0.000000e+00 : f16 + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = memref.load 
%arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x128x28x28xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x128x28x28xf16> } gpu.return } - gpu.func @Unknown143(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown93(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xf16> } gpu.return } - gpu.func @Unknown142(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index + gpu.func @Unknown89(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x 
%1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x256x14x14xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x256x14x14xf16> } gpu.return } - gpu.func @Unknown141(%arg0: memref, %arg1: memref) kernel { - %cst = arith.constant 4.000000e+00 : f32 - %c1 = arith.constant 1 : index + gpu.func @Unknown78(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>, %arg3: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 0.000000e+00 : f16 + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1 : index - scf.if %5 { - %6 = memref.load %arg0[] : memref - %7 = arith.negf %6 : f32 - %8 = arith.divf %7, %cst : f32 - memref.store %8, %arg1[] : memref + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : 
f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown138(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xf16>) kernel { + gpu.func @Unknown74(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c3211264 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x112x112xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x112x112xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown137(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown70(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>, %arg2: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 4.900000e+01 : f16 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = 
gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11] : memref<4x512xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %15 = arith.divf %13, %cst : f16 + %16 = arith.select %14, %15, %cst_0 : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown133(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown69(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf16>, %arg5: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = 
arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg6 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg6, %c1000 : index + %8 = arith.divsi %arg6, %c1000 : index + %9 = memref.load %arg2[%8] : memref<4xf16> + %10 = memref.load %arg0[%8] : memref<4xf16> + %11 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %12 = memref.load %arg3[%8, %7] : memref<4x1000xf16> + %13 = arith.subf %11, %10 : f16 + %14 = math.exp %13 : f16 + %15 = arith.mulf %14, %9 : f16 + %16 = arith.subf %12, %15 : f16 + memref.store %13, %arg4[%8, %7] : memref<4x1000xf16> + memref.store %16, %arg5[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown129(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown68(%arg0: memref<4xf16>, %arg1: memref<4xf16>) kernel { + %c4 = arith.constant 4 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %39 = arith.addf %37, 
%38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c4 step %6 { + %7 = memref.load %arg0[%arg2] : memref<4xf16> + %8 = math.log %7 : f16 + memref.store %8, %arg1[%arg2] : memref<4xf16> } gpu.return } - gpu.func @Unknown125(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown66(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg3, %c1000 : index + %8 = arith.divsi %arg3, %c1000 : index + %9 = memref.load %arg0[%8] : memref<4xf16> + %10 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %11 = arith.subf %10, %9 : f16 + memref.store %11, %arg2[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown121(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown64(%arg0: memref<1000xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi 
%2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg3, %c1000 : index + %8 = arith.divsi %arg3, %c1000 : index + %9 = memref.load %arg0[%7] : memref<1000xf16> + %10 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %11 = arith.addf %10, %9 : f16 + memref.store %11, %arg2[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown114(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown63(%arg0: memref<4x512xf16>, %arg1: memref<4x512xf16>) kernel { + %c2048 = arith.constant 2048 : index + %cst = arith.constant 2.040100e-02 : f16 + %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - 
%27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2048 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<4x512xf16> + %10 = arith.mulf %9, %cst : f16 + memref.store %10, %arg1[%8, %7] : memref<4x512xf16> } gpu.return } - gpu.func @Unknown110(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 
: index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x512x7x7xi1> } gpu.return } - gpu.func @Unknown106(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xi1> } gpu.return } - gpu.func @Unknown102(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, 
%arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x256x14x14xi1> } gpu.return } - gpu.func @Unknown95(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index + gpu.func @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + %cst = arith.constant 0.000000e+00 : f16 %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = 
arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xi1> } gpu.return } - gpu.func @Unknown91(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { + %c401408 = arith.constant 401408 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = 
arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x128x28x28xi1> } gpu.return } - gpu.func @Unknown87(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { + %c401408 = arith.constant 401408 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : 
index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xi1> } gpu.return } - gpu.func @Unknown83(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : 
index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x64x56x56xi1> } gpu.return } - gpu.func @Unknown76(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { + gpu.func @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown72(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>, %arg3: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = 
arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown68(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown64(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>, %arg2: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = 
arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg0[%35, %29] : memref<4x512xf16> - %38 = arith.divf %37, %cst_0 : f16 - %39 = arith.select %36, %38, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown63(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf32>, %arg5: memref<4x1000xf16>, %arg6: memref<4x1000xf32>, %arg7: memref<4x1000xf32>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg3[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %18 = memref.load %arg0[%15] : memref<4xf16> - %19 = memref.load %arg2[%15] : memref<4xf16> - %20 = memref.load %arg4[%15, %9] : memref<4x1000xf32> - %21 = math.log %18 : f16 - %22 = arith.subf %17, %21 : f16 - %23 = math.exp %22 : f16 - %24 = arith.mulf %23, %19 : f16 - %25 = arith.subf %16, %24 : f16 - %26 = arith.extf %22 : f16 to f32 - %27 = arith.mulf %26, %20 : f32 - %28 = arith.extf %25 : f16 to f32 - memref.store %25, %arg5[%15, %9] : memref<4x1000xf16> - memref.store %27, %arg6[%15, %9] : memref<4x1000xf32> - memref.store %28, %arg7[%15, %9] : memref<4x1000xf32> - } - gpu.return - } - gpu.func @Unknown62(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>, %arg3: memref<4x1000xf16>) kernel { - %c0 = 
arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg0[%15] : memref<4xf16> - %18 = arith.subf %16, %17 : f16 - %19 = math.exp %18 : f16 - memref.store %18, %arg2[%15, %9] : memref<4x1000xf16> - memref.store %19, %arg3[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg0[%9] : memref<1000xf32> - %18 = arith.truncf %17 : f32 to f16 - %19 = arith.addf %16, %18 : f16 - memref.store %19, %arg2[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown60(%arg0: memref<4x512xf16>, %arg1: memref<4x512xf16>) kernel { - %cst = arith.constant 2.040100e-02 : f16 - %c0 = arith.constant 0 : index - %c2048 = arith.constant 2048 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2048 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<4x512xf16> - %17 = arith.mulf %16, %cst : f16 - memref.store %17, %arg1[%15, %9] : memref<4x512xf16> - } - gpu.return - } - gpu.func @Unknown59(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = 
gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = 
arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown53(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = 
arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown50(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi 
%4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : 
memref<4x256x14x14xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown44(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown41(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - 
%27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = 
arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown35(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown32(%arg0: memref<4x64x56x56xf16>, %arg1: 
memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xi1> - } - gpu.return - } - gpu.func @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = 
arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - } - gpu.return - } - gpu.func @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xi1> } gpu.return } - gpu.func @Unknown26(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { + gpu.func @Unknown26(%arg0: memref<4x64x112x112xf16>, %arg1: memref<4x64x112x112xf16>, %arg2: 
memref<4x64x112x112xi1>) kernel { + %c3211264 = arith.constant 3211264 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c3211264 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x112x112xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x64x112x112xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x112x112xi1> } gpu.return } - gpu.func @Unknown24(%arg0: memref<4x64x112x112xf16>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown24(%arg0: memref<1000xf32>, %arg1: memref<1000xf16>) kernel { + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c3211264 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - 
%13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x112x112xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x112x112xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x112x112xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg2] : memref<1000xf32> + %8 = arith.truncf %7 : f32 to f16 + memref.store %8, %arg1[%arg2] : memref<1000xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { - %c0 = arith.constant 0 : index %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf16> } gpu.return } gpu.func @Unknown22(%arg0: memref<4x1000xf32>, %arg1: memref<4x1000xf16>) kernel { - %cst = arith.constant -2.500000e-01 : f32 - %c0 = arith.constant 0 : index %c4000 = arith.constant 4000 : index + %cst = arith.constant -2.500000e-01 : f32 %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, 
%c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<4x1000xf32> - %17 = arith.mulf %16, %cst : f32 - %18 = arith.truncf %17 : f32 to f16 - memref.store %18, %arg1[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown21(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown20(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - 
%30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg2, %c1000 : index + %8 = arith.divsi %arg2, %c1000 : index + %9 = memref.load %arg0[%8, %7] : memref<4x1000xf32> + %10 = arith.mulf %9, %cst : f32 + %11 = arith.truncf %10 : f32 to f16 + memref.store %11, %arg1[%8, %7] : memref<4x1000xf16> } gpu.return } gpu.func @Unknown19(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf16> } gpu.return } gpu.func @Unknown18(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index 
%4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf16> } gpu.return } gpu.func @Unknown17(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - } - gpu.return - } - gpu.func @Unknown16(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : 
index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown15(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> } gpu.return } gpu.func @Unknown14(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - 
%c0 = arith.constant 0 : index %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf16> } gpu.return } gpu.func @Unknown13(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c294912 = arith.constant 294912 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, 
%c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf16> } gpu.return } gpu.func @Unknown12(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - } - gpu.return - } - gpu.func @Unknown11(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi 
%c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown10(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> } gpu.return } gpu.func @Unknown9(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = 
arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf16> } gpu.return } gpu.func @Unknown8(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, 
%11, %9, %7] : memref<128x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf16> } gpu.return } gpu.func @Unknown7(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> } gpu.return } - gpu.func @Unknown6(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown3(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = 
arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf16> } gpu.return } - gpu.func @Unknown5(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index + gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { + %c9408 = arith.constant 9408 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf16> } gpu.return } - gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index + gpu.func @Unknown0(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x3x224x224xf16>) kernel { + %c602112 = arith.constant 602112 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c224 = arith.constant 224 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : 
index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c602112 step %6 { + %7 = arith.remsi %arg2, %c224 : index + %8 = arith.divsi %arg2, %c224 : index + %9 = arith.remsi %8, %c224 : index + %10 = arith.divsi %8, %c224 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x3x224x224xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x3x224x224xf16> } gpu.return } - gpu.func @Unknown3(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index + gpu.func @Unknown25_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = 
arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %32 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %32 : f16 + } else { + scf.yield %cst : f16 + } + %19 = arith.addf %18, %cst : f16 + %20 = arith.cmpi ugt, %16, %c1 : index + %21 = scf.if %20 -> (f16) { + %32 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %32 : f16 + } else { + scf.yield %cst : f16 + } + %22 = arith.addf %19, %21 : f16 + memref.store %22, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %23 = arith.cmpi ult, %1, %c256 : index + scf.if %23 { + %32 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca[%34] : memref<512xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %24 = arith.cmpi ult, %1, %c128 : index + scf.if %24 { + %32 = memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_2[%34] : memref<256xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c64 : index + scf.if %25 { + %32 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_3[%34] : memref<128xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c32 : index + scf.if %26 { + %32 = memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_4[%34] : memref<64xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c16 : index + scf.if %27 { + %32 = memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_5[%34] : memref<32xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c8 : index + scf.if %28 { + %32 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load 
%alloca_6[%34] : memref<16xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_7[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c4 : index + scf.if %29 { + %32 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_7[%34] : memref<8xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %30 = arith.cmpi ult, %1, %c2 : index + scf.if %30 { + %32 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_8[%34] : memref<4xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %31 = arith.cmpi ult, %1, %c1 : index + scf.if %31 { + %32 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_9[%34] : memref<2xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown62_kernel(%arg0: memref<2048x49xf16>, %arg1: memref<2048xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c49 = arith.constant 49 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func 
@Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index + %subview = memref.subview %arg0[%0, 0] [1, 49] [1, 1] : memref<2048x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c64 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c64 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c49 : index + %7 = arith.select %6, %5, %c49 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c49 : index + %10 = arith.select %9, %8, %c49 : index + %11 = arith.subi %10, %7 : index + %subview_0 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13 = scf.if %12 -> (f16) { + %21 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %14 = arith.addf %13, %cst : f16 + memref.store %14, %alloca[%1] : memref<64xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<32xf16, #gpu.address_space> + %15 = arith.cmpi ult, %1, %c32 : index + scf.if %15 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca[%21] : memref<64xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca[%24] : memref<64xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_2[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<16xf16, #gpu.address_space> + %16 = arith.cmpi ult, %1, %c16 : index + scf.if %16 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_2[%21] : memref<32xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_2[%24] : memref<32xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_3[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<8xf16, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c8 : index + scf.if %17 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_3[%21] : memref<16xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_3[%24] : memref<16xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_4[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<4xf16, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c4 : index + scf.if %18 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_4[%21] : memref<8xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_4[%24] : memref<8xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_5[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<2xf16, #gpu.address_space> + %19 = 
arith.cmpi ult, %1, %c2 : index + scf.if %19 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_5[%21] : memref<4xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_5[%24] : memref<4xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_6[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %20 = arith.cmpi ult, %1, %c1 : index + scf.if %20 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_6[%21] : memref<2xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_6[%24] : memref<2xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %arg1[%0] : memref<2048xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown65_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf16> - } - gpu.return - } - gpu.func @Unknown0(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x3x224x224xf16>) kernel { %c0 = arith.constant 0 : index - %c602112 = arith.constant 602112 : index - %c224 = arith.constant 224 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into 
memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %31 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %31 : f16 + } else { + scf.yield %cst : f16 + } + %19 = arith.cmpi ugt, %16, %c1 : index + %20 = scf.if %19 -> (f16) { + %31 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %31 : f16 + } else { + scf.yield %cst : f16 + } + %21 = arith.maximumf %18, %20 : f16 + memref.store %21, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %22 = arith.cmpi ult, %1, %c256 : index + scf.if %22 { + %31 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca[%32] : memref<512xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %23 = arith.cmpi ult, %1, %c128 : index + scf.if %23 { + %31 = memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_2[%32] : memref<256xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %24 = arith.cmpi ult, %1, %c64 : index + scf.if %24 { + %31 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_3[%32] : memref<128xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c32 : index + scf.if %25 { + %31 = memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_4[%32] : memref<64xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c16 : index + scf.if %26 { + %31 = memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_5[%32] : memref<32xf16, 
#gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c8 : index + scf.if %27 { + %31 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_6[%32] : memref<16xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_7[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c4 : index + scf.if %28 { + %31 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_7[%32] : memref<8xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c2 : index + scf.if %29 { + %31 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_8[%32] : memref<4xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %30 = arith.cmpi ult, %1, %c1 : index + scf.if %30 { + %31 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_9[%32] : memref<2xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown67_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : 
memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %34 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %34 : f16 + } else { + scf.yield %cst : f16 + } + %19 = math.exp %18 : f16 + %20 = arith.addf %19, %cst : f16 + %21 = arith.cmpi ugt, %16, %c1 : index + %22 = scf.if %21 -> (f16) { + %34 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %34 : f16 + } else { + scf.yield %cst : f16 + } + %23 = math.exp %22 : f16 + %24 = arith.addf %20, %23 : f16 + memref.store %24, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c256 : index + scf.if %25 { + %34 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca[%36] : memref<512xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c128 : index + scf.if %26 { + %34 = memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_2[%36] : memref<256xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c64 : index + scf.if %27 { + %34 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_3[%36] : memref<128xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c32 : index + scf.if %28 { + %34 = memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_4[%36] : memref<64xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c16 : index + scf.if %29 { + %34 = memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_5[%36] : memref<32xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %30 = arith.cmpi ult, %1, %c8 : index + scf.if %30 { + %34 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_6[%36] : memref<16xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_7[%1] : memref<8xf16, 
#gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %31 = arith.cmpi ult, %1, %c4 : index + scf.if %31 { + %34 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_7[%36] : memref<8xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %32 = arith.cmpi ult, %1, %c2 : index + scf.if %32 { + %34 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_8[%36] : memref<4xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %33 = arith.cmpi ult, %1, %c1 : index + scf.if %33 { + %34 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_9[%36] : memref<2xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown147_kernel(%arg0: memref<32x125xf16>, %arg1: memref<32x125xf32>, %arg2: memref<32xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c2 = arith.constant 2 : index + %c64 = arith.constant 64 : index + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c125 = arith.constant 125 : index + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c602112 : index - scf.if %5 { - %6 = arith.remsi %4, %c224 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c224 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c224 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c224 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c224 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c224 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x3x224x224xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x3x224x224xf16> - } + %subview = memref.subview %arg0[%0, 0] [1, 125] 
[1, 1] : memref<32x125xf16> to memref<125xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<125xf16, strided<[1], offset: ?>> into memref<1x125xf16, strided<[125, 1], offset: ?>> + %subview_1 = memref.subview %arg1[%0, 0] [1, 125] [1, 1] : memref<32x125xf32> to memref<125xf32, strided<[1], offset: ?>> + %expand_shape_2 = memref.expand_shape %subview_1 [[0, 1]] : memref<125xf32, strided<[1], offset: ?>> into memref<1x125xf32, strided<[125, 1], offset: ?>> + %alloca = memref.alloca() : memref<128xf32, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c128 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c128 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c125 : index + %7 = arith.select %6, %5, %c125 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c125 : index + %10 = arith.select %9, %8, %c125 : index + %11 = arith.subi %10, %7 : index + %subview_3 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x125xf16, strided<[125, 1], offset: ?>> to memref> + %expand_shape_4 = memref.expand_shape %subview_3 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %subview_5 = memref.subview %expand_shape_2[0, %7] [1, %11] [1, 1] : memref<1x125xf32, strided<[125, 1], offset: ?>> to memref> + %expand_shape_6 = memref.expand_shape %subview_5 [[0, 1]] : memref> into memref<1x?xf32, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13:2 = scf.if %12 -> (f16, f32) { + %24 = memref.load %expand_shape_4[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + %25 = memref.load %expand_shape_6[%c0, %c0] : memref<1x?xf32, strided<[?, 1], offset: ?>> + scf.yield %24, %25 : f16, f32 + } else { + scf.yield %cst_0, %cst : f16, f32 + } + %14 = arith.extf %13#0 : f16 to f32 + %15 = arith.mulf %14, %13#1 : f32 + %16 = arith.addf %15, %cst : f32 + memref.store %16, %alloca[%1] : memref<128xf32, #gpu.address_space> + gpu.barrier + %alloca_7 = memref.alloca() : memref<64xf32, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c64 : index + scf.if %17 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca[%24] : memref<128xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca[%27] : memref<128xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_7[%1] : memref<64xf32, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<32xf32, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c32 : index + scf.if %18 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_7[%24] : memref<64xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_7[%27] : memref<64xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_8[%1] : memref<32xf32, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<16xf32, #gpu.address_space> + %19 = arith.cmpi ult, %1, %c16 : index + scf.if %19 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_8[%24] : memref<32xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_8[%27] : memref<32xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_9[%1] : memref<16xf32, #gpu.address_space> + } + gpu.barrier + %alloca_10 = memref.alloca() : memref<8xf32, 
#gpu.address_space> + %20 = arith.cmpi ult, %1, %c8 : index + scf.if %20 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_9[%24] : memref<16xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_9[%27] : memref<16xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_10[%1] : memref<8xf32, #gpu.address_space> + } + gpu.barrier + %alloca_11 = memref.alloca() : memref<4xf32, #gpu.address_space> + %21 = arith.cmpi ult, %1, %c4 : index + scf.if %21 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_10[%24] : memref<8xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_10[%27] : memref<8xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_11[%1] : memref<4xf32, #gpu.address_space> + } + gpu.barrier + %alloca_12 = memref.alloca() : memref<2xf32, #gpu.address_space> + %22 = arith.cmpi ult, %1, %c2 : index + scf.if %22 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_11[%24] : memref<4xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_11[%27] : memref<4xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_12[%1] : memref<2xf32, #gpu.address_space> + } + gpu.barrier + %23 = arith.cmpi ult, %1, %c1 : index + scf.if %23 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_12[%24] : memref<2xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_12[%27] : memref<2xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %arg2[%0] : memref<32xf32> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown147_kernel_0(%arg0: memref<32xf32>, %arg1: memref) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c16 = arith.constant 16 : index + %cst = arith.constant 0.000000e+00 : f32 + %c32 = arith.constant 32 : index + %0 = gpu.block_id x + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %0, %c32 : index + %3 = arith.addi %2, %1 : index + %4 = memref.load %arg0[%3] : memref<32xf32> + %5 = arith.addf %4, %cst : f32 + memref.store %5, %alloca[%1] : memref<32xf32, #gpu.address_space> + gpu.barrier + %alloca_0 = memref.alloca() : memref<16xf32, #gpu.address_space> + %6 = arith.cmpi ult, %1, %c16 : index + scf.if %6 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca[%11] : memref<32xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca[%14] : memref<32xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_0[%1] : memref<16xf32, #gpu.address_space> + } + gpu.barrier + %alloca_1 = memref.alloca() : memref<8xf32, #gpu.address_space> + %7 = arith.cmpi ult, %1, %c8 : index + scf.if %7 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_0[%11] : memref<16xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_0[%14] : memref<16xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_1[%1] : memref<8xf32, 
#gpu.address_space> + } + gpu.barrier + %alloca_2 = memref.alloca() : memref<4xf32, #gpu.address_space> + %8 = arith.cmpi ult, %1, %c4 : index + scf.if %8 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_1[%11] : memref<8xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_1[%14] : memref<8xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_2[%1] : memref<4xf32, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<2xf32, #gpu.address_space> + %9 = arith.cmpi ult, %1, %c2 : index + scf.if %9 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_2[%11] : memref<4xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_2[%14] : memref<4xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_3[%1] : memref<2xf32, #gpu.address_space> + } + gpu.barrier + %10 = arith.cmpi ult, %1, %c1 : index + scf.if %10 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_3[%11] : memref<2xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_3[%14] : memref<2xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %arg1[] : memref + } + gpu.barrier + gpu.return + } + gpu.func @Unknown171_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<1000xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1000 = arith.constant 1000 : index + %c-32 = arith.constant -32 : index + %0 = gpu.block_id x + %1 = arith.muli %0, %c-32 : index + %2 = arith.addi %1, %c1000 : index + %3 = arith.cmpi slt, %2, %c32 : index + %4 = arith.select %3, %2, %c32 : index + %5 = arith.muli %0, %c32 : index + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %alloca_1 = memref.alloca() : memref<2x32xf32, #gpu.address_space> + %6 = gpu.thread_id x + %7 = gpu.thread_id y + %8 = arith.cmpi slt, %4, %6 : index + %9 = arith.select %8, %4, %6 : index + %10 = arith.addi %6, %c1 : index + %11 = arith.cmpi slt, %4, %10 : index + %12 = arith.select %11, %4, %10 : index + %13 = arith.subi %12, %9 : index + %14 = arith.cmpi ugt, %13, %c0 : index + %15 = scf.if %14 -> (f16) { + %22 = arith.muli %7, %c2 : index + %23 = arith.addi %5, %9 : index + %24 = memref.load %arg0[%22, %23] : memref<4x1000xf16> + scf.yield %24 : f16 + } else { + scf.yield %cst_0 : f16 + } + %16 = arith.extf %15 : f16 to f32 + %17 = arith.addf %16, %cst : f32 + %18 = scf.if %14 -> (f16) { + %22 = arith.muli %7, %c2 : index + %23 = arith.addi %22, %c1 : index + %24 = arith.addi %5, %9 : index + %25 = memref.load %arg0[%23, %24] : memref<4x1000xf16> + scf.yield %25 : f16 + } else { + scf.yield %cst_0 : f16 + } + %19 = arith.extf %18 : f16 to f32 + %20 = arith.addf %17, %19 : f32 + memref.store %20, %alloca_1[%7, %6] : memref<2x32xf32, #gpu.address_space> + gpu.barrier + %21 = arith.cmpi ult, %7, %c1 : index + scf.if %21 { + %22 = memref.load %alloca_1[%c0, %6] : memref<2x32xf32, #gpu.address_space> + %23 = arith.addf %22, %cst : f32 + %24 = memref.load %alloca_1[%c1, %6] : memref<2x32xf32, #gpu.address_space> + %25 = arith.addf %24, %23 : f32 + memref.store %25, 
%alloca[%6] : memref<32xf32, #gpu.address_space> + } + gpu.barrier + %subview = memref.subview %alloca[0] [%4] [1] : memref<32xf32, #gpu.address_space> to memref, #gpu.address_space> + %subview_2 = memref.subview %arg1[%5] [%4] [1] : memref<1000xf32> to memref> + memref.copy %subview, %subview_2 : memref, #gpu.address_space> to memref> gpu.return } } func.func @main(%arg0: memref<4x3x224x224xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<4x1000xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<64xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<64xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<64xf32, "cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<64xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<64xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<64xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<64xf32, "cuda"> {byre.argname = "Input15", byre.argtype = 1 : i32}, %arg16: memref<64xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<64xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<64xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<64xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<64xf32, "cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<64xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<64xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<64xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<64xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<128xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<128xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<128xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, %arg31: memref<128xf32, "cuda"> {byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<128xf32, "cuda"> {byre.argname = "Input33", byre.argtype = 1 : i32}, %arg34: memref<128xf32, "cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<128xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<128xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<128x64x1x1xf32, "cuda"> {byre.argname = 
"Input37", byre.argtype = 1 : i32}, %arg38: memref<128xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<128xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<128xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<128xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<128xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<128xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<128xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<128xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<128xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<128xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<128xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<128xf32, "cuda"> {byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<256xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<256xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<256xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<256xf32, "cuda"> {byre.argname = "Input56", byre.argtype = 1 : i32}, %arg57: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<256xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<256xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<256xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<256xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<256xf32, "cuda"> {byre.argname = "Input63", byre.argtype = 1 : i32}, %arg64: memref<256xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref<256xf32, "cuda"> {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<256xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref<256xf32, "cuda"> {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<256xf32, "cuda"> {byre.argname = "Input69", byre.argtype = 1 : i32}, %arg70: memref<256xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref<256xf32, "cuda"> {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<256xf32, "cuda"> {byre.argname = "Input73", byre.argtype = 1 : i32}, %arg74: memref<256xf32, "cuda"> {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<256xf32, "cuda"> {byre.argname = "Input75", byre.argtype = 1 : i32}, %arg76: memref<256xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<512xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: 
memref<512xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref<512xf32, "cuda"> {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<512xf32, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref<512xf32, "cuda"> {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<512xf32, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<512xf32, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref<512xf32, "cuda"> {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<512xf32, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref<512xf32, "cuda"> {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<512xf32, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<512xf32, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<512xf32, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<512xf32, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref<512xf32, "cuda"> {byre.argname = "Input95", byre.argtype = 1 : i32}, %arg96: memref<512xf32, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref<512xf32, "cuda"> {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<512xf32, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<512xf32, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref<512xf32, "cuda"> {byre.argname = "Input101", byre.argtype = 1 : i32}, %arg102: memref<1000x512xf32, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<1000xf32, "cuda"> {byre.argname = "Input103", byre.argtype = 1 : i32}, %arg104: memref {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg105: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}, %arg106: memref<64xf32, "cuda"> {byre.argname = "Output2", byre.argtype = 2 : i32}, %arg107: memref<64xf32, "cuda"> {byre.argname = "Output3", byre.argtype = 2 : i32}, %arg108: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output4", byre.argtype = 2 : i32}, %arg109: memref<64xf32, "cuda"> {byre.argname = "Output5", byre.argtype = 2 : i32}, %arg110: memref<64xf32, "cuda"> {byre.argname = "Output6", byre.argtype = 2 : i32}, %arg111: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output7", byre.argtype = 2 : i32}, %arg112: memref<64xf32, "cuda"> {byre.argname = "Output8", byre.argtype = 2 : i32}, %arg113: memref<64xf32, "cuda"> {byre.argname = "Output9", byre.argtype = 2 : i32}, %arg114: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output10", byre.argtype = 2 : i32}, %arg115: memref<64xf32, "cuda"> {byre.argname = "Output11", byre.argtype = 2 : i32}, %arg116: memref<64xf32, "cuda"> {byre.argname = "Output12", byre.argtype = 2 : i32}, %arg117: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output13", byre.argtype = 2 : i32}, %arg118: memref<64xf32, "cuda"> {byre.argname = "Output14", byre.argtype = 2 : i32}, %arg119: memref<64xf32, "cuda"> {byre.argname = "Output15", byre.argtype = 2 : i32}, %arg120: memref<128x64x3x3xf32, "cuda"> 
{byre.argname = "Output16", byre.argtype = 2 : i32}, %arg121: memref<128xf32, "cuda"> {byre.argname = "Output17", byre.argtype = 2 : i32}, %arg122: memref<128xf32, "cuda"> {byre.argname = "Output18", byre.argtype = 2 : i32}, %arg123: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output19", byre.argtype = 2 : i32}, %arg124: memref<128xf32, "cuda"> {byre.argname = "Output20", byre.argtype = 2 : i32}, %arg125: memref<128xf32, "cuda"> {byre.argname = "Output21", byre.argtype = 2 : i32}, %arg126: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Output22", byre.argtype = 2 : i32}, %arg127: memref<128xf32, "cuda"> {byre.argname = "Output23", byre.argtype = 2 : i32}, %arg128: memref<128xf32, "cuda"> {byre.argname = "Output24", byre.argtype = 2 : i32}, %arg129: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output25", byre.argtype = 2 : i32}, %arg130: memref<128xf32, "cuda"> {byre.argname = "Output26", byre.argtype = 2 : i32}, %arg131: memref<128xf32, "cuda"> {byre.argname = "Output27", byre.argtype = 2 : i32}, %arg132: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output28", byre.argtype = 2 : i32}, %arg133: memref<128xf32, "cuda"> {byre.argname = "Output29", byre.argtype = 2 : i32}, %arg134: memref<128xf32, "cuda"> {byre.argname = "Output30", byre.argtype = 2 : i32}, %arg135: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Output31", byre.argtype = 2 : i32}, %arg136: memref<256xf32, "cuda"> {byre.argname = "Output32", byre.argtype = 2 : i32}, %arg137: memref<256xf32, "cuda"> {byre.argname = "Output33", byre.argtype = 2 : i32}, %arg138: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output34", byre.argtype = 2 : i32}, %arg139: memref<256xf32, "cuda"> {byre.argname = "Output35", byre.argtype = 2 : i32}, %arg140: memref<256xf32, "cuda"> {byre.argname = "Output36", byre.argtype = 2 : i32}, %arg141: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Output37", byre.argtype = 2 : i32}, %arg142: memref<256xf32, "cuda"> {byre.argname = "Output38", byre.argtype = 2 : i32}, %arg143: memref<256xf32, "cuda"> {byre.argname = "Output39", byre.argtype = 2 : i32}, %arg144: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output40", byre.argtype = 2 : i32}, %arg145: memref<256xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : i32}, %arg146: memref<256xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg147: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg148: memref<256xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg149: memref<256xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, %arg150: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg151: memref<512xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg152: memref<512xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg153: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : i32}, %arg154: memref<512xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg155: memref<512xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg156: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg157: memref<512xf32, "cuda"> {byre.argname = "Output53", byre.argtype = 2 : i32}, %arg158: memref<512xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg159: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg160: memref<512xf32, "cuda"> 
{byre.argname = "Output56", byre.argtype = 2 : i32}, %arg161: memref<512xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg162: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg163: memref<512xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg164: memref<512xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg165: memref<1000x512xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}, %arg166: memref<1000xf32, "cuda"> {byre.argname = "Output62", byre.argtype = 2 : i32}) attributes {byre.entry_point} { - %alloc = memref.alloc() : memref<76022848xi8, "cuda"> - %0 = "byre.alias"(%alloc) {offset = 8012864 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x3x224x224xf16, "cuda"> - byre.compute @PTXOp(%arg0, %0) {BlockSize.x = 128 : i32, GridSize.x = 4704 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<4x3x224x224xf32, "cuda">, memref<4x3x224x224xf16, "cuda"> - %1 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg2, %1) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> - %2 = "byre.alias"(%alloc) {offset = 50996288 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + %alloc = memref.alloc() : memref<76533504xi8, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 75329280 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x3x224x224xf16, "cuda"> + byre.compute @PTXOp(%arg0, %0) {BlockSize.x = 256 : i32, GridSize.x = 588 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<4x3x224x224xf32, "cuda">, memref<4x3x224x224xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg2, %1) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 49311488 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%0, %1, %2) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<64x3x7x7xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - %3 = "byre.alias"(%alloc) {offset = 44573760 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + %3 = "byre.alias"(%alloc) <{offset = 42888960 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%2, %arg3, %arg4, %3) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda"> - %4 = "byre.alias"(%alloc) {offset = 5080128 : i64} : 
(memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg7, %4) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %5 = "byre.alias"(%alloc) {offset = 5006400 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg12, %5) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %6 = "byre.alias"(%alloc) {offset = 1552384 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg17, %6) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown5", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %7 = "byre.alias"(%alloc) {offset = 5153856 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg22, %7) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %8 = "byre.alias"(%alloc) {offset = 4247104 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg37, %8) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> - %9 = "byre.alias"(%alloc) {offset = 602112 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg27, %9) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown8", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> - %10 = "byre.alias"(%alloc) {offset = 2383872 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg32, %10) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %11 = "byre.alias"(%alloc) {offset = 2088960 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg42, %11) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown10", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %12 = "byre.alias"(%alloc) {offset = 2678784 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg47, %12) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown11", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %13 = "byre.alias"(%alloc) {offset = 4940864 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg62, %13) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, 
memref<256x128x1x1xf16, "cuda"> - %14 = "byre.alias"(%alloc) {offset = 60228672 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg52, %14) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> - %15 = "byre.alias"(%alloc) {offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg57, %15) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %16 = "byre.alias"(%alloc) {offset = 18850880 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg67, %16) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown15", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %17 = "byre.alias"(%alloc) {offset = 6833216 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg72, %17) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %18 = "byre.alias"(%alloc) {offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg87, %18) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown17", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> - %19 = "byre.alias"(%alloc) {offset = 21636160 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg77, %19) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> - %20 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg82, %20) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - %21 = "byre.alias"(%alloc) {offset = 33432640 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg92, %21) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - %22 = "byre.alias"(%alloc) {offset = 28714048 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg97, %22) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - %23 = "byre.alias"(%alloc) {offset = 749568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%arg1, %23) {BlockSize.x = 128 : i32, 
GridSize.x = 32 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown22", memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda"> - %24 = "byre.alias"(%alloc) {offset = 23995456 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%arg102, %24) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> - %25 = "byre.alias"(%alloc) {offset = 757568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%23, %25) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %26 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> - %27 = "byre.alias"(%alloc) {offset = 59827264 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xi1, "cuda"> - byre.compute @PTXOp(%3, %26, %27) {BlockSize.x = 128 : i32, GridSize.x = 25088 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda"> - %28 = "byre.alias"(%alloc) {offset = 5227584 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @PoolMaxOp_f16_f16(%26, %28) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %29 = "byre.alias"(%alloc) {offset = 20030528 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%28, %4, %29) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %30 = "byre.alias"(%alloc) {offset = 44573760 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%29, %arg8, %arg9, %30) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %31 = "byre.alias"(%alloc) {offset = 17245248 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %32 = "byre.alias"(%alloc) {offset = 301056 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%30, %31, %32) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %33 = 
"byre.alias"(%alloc) {offset = 15639616 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%31, %5, %33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %34 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%33, %arg13, %arg14, %34) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %35 = "byre.alias"(%alloc) {offset = 501760 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%34, %28, %30, %35) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown28", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %36 = "byre.alias"(%alloc) {offset = 14033984 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%30, %6, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %37 = "byre.alias"(%alloc) {offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%36, %arg18, %arg19, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %38 = "byre.alias"(%alloc) {offset = 12428352 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %39 = "byre.alias"(%alloc) {offset = 200704 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%37, %38, %39) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %40 = "byre.alias"(%alloc) {offset = 9217088 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 5545728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg7, %4) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : 
memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 5361664 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg12, %5) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 6283008 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg17, %6) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 6209280 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg22, %7) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 5463808 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg37, %8) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 6557440 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg27, %9) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown8", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 2256896 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg32, %10) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 1761280 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg42, %11) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg47, %12) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 5480192 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg62, %13) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 5619456 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg52, %14) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, 
arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 74149632 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg57, %15) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg67, %16) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg72, %17) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %18 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg87, %18) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown17", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 23162624 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg77, %19) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> + %20 = "byre.alias"(%alloc) <{offset = 28733184 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg82, %20) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 33451776 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg92, %21) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 38170368 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg97, %22) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 5439616 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%arg1, %23) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown22", memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda"> + %24 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> : 
(memref<76533504xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%arg102, %24) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 73993984 : i64}> : (memref<76533504xi8, "cuda">) -> memref<1000xf16, "cuda"> + byre.compute @PTXOp(%arg103, %25) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf16, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 5435392 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%23, %26) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown25_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + %28 = "byre.alias"(%alloc) <{offset = 25521920 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xi1, "cuda"> + byre.compute @PTXOp(%3, %27, %28) {BlockSize.x = 256 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 15134464 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @PoolMaxOp_f16_f16(%27, %29) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 16740096 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%29, %4, %30) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 42888960 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%30, %arg8, %arg9, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 19951360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 69381888 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%31, %32, %33) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, 
arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown28", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 7106304 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%32, %5, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%34, %arg13, %arg14, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 18345728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 70987520 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%31, %29, %35, %36) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%35, %6, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 44494592 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%31, %arg18, %arg19, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 13528832 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 57339648 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%37, %38, %39) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown28", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 11923200 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%38, %7, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : 
tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%40, %arg23, %arg24, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %41 = "byre.alias"(%alloc) {offset = 10822720 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %42 = "byre.alias"(%alloc) {offset = 401408 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%37, %30, %41, %42) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown32", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %43 = "byre.alias"(%alloc) {offset = 61621312 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%41, %8, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %44 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 10317568 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%40, %arg23, %arg24, %41) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %42 = "byre.alias"(%alloc) <{offset = 70184704 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%41, %35, %37, %42) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 58142464 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%37, %8, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 10317568 : 
i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%43, %arg38, %arg39, %44) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %45 = "byre.alias"(%alloc) {offset = 70452288 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%41, %9, %45) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %46 = "byre.alias"(%alloc) {offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 59748096 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%37, %9, %45) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 46100224 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%45, %arg28, %arg29, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %47 = "byre.alias"(%alloc) {offset = 71255104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %48 = "byre.alias"(%alloc) {offset = 4740160 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %47, %48) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %49 = "byre.alias"(%alloc) {offset = 72057920 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %47 = "byre.alias"(%alloc) <{offset = 60550912 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %48 = "byre.alias"(%alloc) <{offset = 3756032 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %47, %48) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %49 = "byre.alias"(%alloc) <{offset = 62156544 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> 
byre.compute @ConvOp_f16f16_f16(%47, %10, %49) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%49, %arg33, %arg34, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %50 = "byre.alias"(%alloc) {offset = 69649472 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %51 = "byre.alias"(%alloc) {offset = 4790336 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %44, %50, %51) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %52 = "byre.alias"(%alloc) {offset = 60818496 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %50 = "byre.alias"(%alloc) <{offset = 55734016 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %51 = "byre.alias"(%alloc) <{offset = 3354624 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %44, %50, %51) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown39", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %52 = "byre.alias"(%alloc) <{offset = 61353728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%50, %11, %52) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%52, %arg43, %arg44, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %53 = "byre.alias"(%alloc) {offset = 57418816 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %54 = "byre.alias"(%alloc) {offset = 4840512 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %53, %54) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown39", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : 
memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %55 = "byre.alias"(%alloc) {offset = 68846656 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %53 = "byre.alias"(%alloc) <{offset = 58945280 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %54 = "byre.alias"(%alloc) <{offset = 2953216 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %53, %54) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %55 = "byre.alias"(%alloc) <{offset = 56536832 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%53, %12, %55) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%55, %arg48, %arg49, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %56 = "byre.alias"(%alloc) {offset = 72860736 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %57 = "byre.alias"(%alloc) {offset = 4890688 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %50, %56, %57) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown41", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %58 = "byre.alias"(%alloc) {offset = 2973696 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%56, %13, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %59 = "byre.alias"(%alloc) {offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%58, %arg63, %arg64, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %60 = "byre.alias"(%alloc) {offset = 59024448 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%56, %14, 
%60) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %61 = "byre.alias"(%alloc) {offset = 46580800 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%60, %arg53, %arg54, %61) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %62 = "byre.alias"(%alloc) {offset = 58623040 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %63 = "byre.alias"(%alloc) {offset = 4263488 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%61, %62, %63) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %64 = "byre.alias"(%alloc) {offset = 58221632 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%62, %15, %64) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%64, %arg58, %arg59, %61) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %65 = "byre.alias"(%alloc) {offset = 4338752 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %66 = "byre.alias"(%alloc) {offset = 4288576 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%61, %59, %65, %66) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %67 = "byre.alias"(%alloc) {offset = 3776512 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%65, %16, %67) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, 
"cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%67, %arg68, %arg69, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %68 = "byre.alias"(%alloc) {offset = 3375104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %69 = "byre.alias"(%alloc) {offset = 4313664 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%59, %68, %69) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %70 = "byre.alias"(%alloc) {offset = 74843200 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%68, %17, %70) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%70, %arg73, %arg74, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %71 = "byre.alias"(%alloc) {offset = 75244608 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %72 = "byre.alias"(%alloc) {offset = 4177920 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%59, %65, %71, %72) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown50", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %73 = "byre.alias"(%alloc) {offset = 950272 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%71, %18, %73) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %74 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%73, %arg88, %arg89, %74) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %75 
= "byre.alias"(%alloc) {offset = 1150976 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%71, %19, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %76 = "byre.alias"(%alloc) {offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%75, %arg78, %arg79, %76) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %77 = "byre.alias"(%alloc) {offset = 1351680 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %78 = "byre.alias"(%alloc) {offset = 59688000 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%76, %77, %78) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown53", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %79 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%77, %20, %79) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%79, %arg83, %arg84, %76) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %80 = "byre.alias"(%alloc) {offset = 1626112 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %81 = "byre.alias"(%alloc) {offset = 59700544 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%76, %74, %80, %81) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown55", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - byre.compute @ConvOp_f16f16_f16(%80, %21, %76) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, 
memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %82 = "byre.alias"(%alloc) {offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%76, %arg93, %arg94, %82) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %83 = "byre.alias"(%alloc) {offset = 1826816 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %84 = "byre.alias"(%alloc) {offset = 59713088 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%82, %83, %84) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %85 = "byre.alias"(%alloc) {offset = 75646016 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%83, %22, %85) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%85, %arg98, %arg99, %74) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %86 = "byre.alias"(%alloc) {offset = 75846720 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%74, %80, %82, %86) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown59", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %87 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%82, %87) {device = "cuda", dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512xf16, "cuda"> - %88 = "byre.alias"(%alloc) {offset = 4203008 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @PTXOp(%87, %88) {BlockSize.x = 128 : i32, GridSize.x = 16 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown60", memory_effects = [1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512xf16, "cuda"> - %89 = "byre.alias"(%alloc) {offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%88, %24, %89) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x1000xf16, "cuda"> - %90 = "byre.alias"(%alloc) {offset = 25019456 : i64} 
: (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%arg103, %89, %90) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> - %91 = "byre.alias"(%alloc) {offset = 25027456 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceMaxOp_f16_f16(%90, %91) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %92 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%91, %90, %92, %89) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> - %93 = "byre.alias"(%alloc) {offset = 42877824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%89, %93) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %94 = "byre.alias"(%alloc) {offset = 4207104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - %95 = "byre.alias"(%alloc) {offset = 4215104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf32, "cuda"> - %96 = "byre.alias"(%alloc) {offset = 4231104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf32, "cuda"> - byre.compute @PTXOp(%93, %92, %25, %23, %arg1, %94, %95, %96) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown63", memory_effects = [1 : i32, 1 : i32, 1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">, memref<4x1000xf32, "cuda"> - %97 = "byre.alias"(%alloc) {offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%94, %24, %97) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x512xf16, "cuda"> - %98 = "byre.alias"(%alloc) {offset = 749568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%97, %86, %98) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%85, %arg98, %98, %82, %arg163, %arg164) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%82, %22, %74) 
{batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%83, %82, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%84, %74, %82) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown68", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%76, %arg93, %82, %74, %arg160, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%74, %21, %76) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %99 = "byre.alias"(%alloc) {offset = 23995456 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%80, %74, %99) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%98, %76, %81, %83) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%79, %arg83, %83, %76, %arg154, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%76, %20, %85) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, 
window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%77, %76, %21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%78, %85, %76) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown76", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %100 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%75, %arg78, %76, %100, %arg151, %arg152) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%100, %19, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%71, %100, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%73, %arg88, %83, %100, %arg157, %arg158) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%100, %18, %61) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %101 = "byre.alias"(%alloc) {offset = 1826816 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%71, %100, %101) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> 
: tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> - %102 = "byre.alias"(%alloc) {offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%61, %59, %72, %102) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown83", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%70, %arg73, %102, %59, %arg148, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %103 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%59, %17, %103) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %104 = "byre.alias"(%alloc) {offset = 74843200 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%68, %59, %104) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%69, %103, %59) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown87", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%67, %arg68, %59, %103, %arg145, %arg146) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %16, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%65, %103, %16) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = 
dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%102, %59, %66, %65) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown91", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%64, %arg58, %65, %59, %arg139, %arg140) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%59, %15, %103) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%62, %59, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%63, %103, %59) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%60, %arg53, %59, %103, %arg136, %arg137) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %14, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%56, %103, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%58, %arg63, %65, %103, %arg142, %arg143) {device = "cuda", epsilon = 9.99999974E-6 : f32, 
feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %105 = "byre.alias"(%alloc) {offset = 46982208 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %13, %105) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %106 = "byre.alias"(%alloc) {offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%56, %103, %106) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> - byre.compute @PTXOp(%105, %46, %57, %56) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown102", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%55, %arg48, %56, %46, %arg133, %arg134) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %107 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %56 = "byre.alias"(%alloc) <{offset = 4960256 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %50, %44, %56) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown39", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %57 = "byre.alias"(%alloc) <{offset = 11120384 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%44, %13, %57) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %58 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute 
@BatchNormTrainingOp_f16f32f32_f16(%57, %arg63, %arg64, %58) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %59 = "byre.alias"(%alloc) <{offset = 46100224 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%44, %14, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %60 = "byre.alias"(%alloc) <{offset = 46501632 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%59, %arg53, %arg54, %60) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %61 = "byre.alias"(%alloc) <{offset = 2551808 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %62 = "byre.alias"(%alloc) <{offset = 6704896 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%60, %61, %62) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%61, %15, %60) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %63 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%60, %arg58, %arg59, %63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %64 = "byre.alias"(%alloc) <{offset = 4157440 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %65 = "byre.alias"(%alloc) <{offset = 6905600 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %58, %64, %65) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %66 = "byre.alias"(%alloc) <{offset = 22736640 : i64}> : 
(memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%64, %16, %66) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%66, %arg68, %arg69, %63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %67 = "byre.alias"(%alloc) <{offset = 71790336 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %68 = "byre.alias"(%alloc) <{offset = 2056192 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %67, %68) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %69 = "byre.alias"(%alloc) <{offset = 9891584 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%67, %17, %69) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%69, %arg73, %arg74, %63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %70 = "byre.alias"(%alloc) <{offset = 72191744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %71 = "byre.alias"(%alloc) <{offset = 294912 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %64, %70, %71) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %72 = "byre.alias"(%alloc) <{offset = 495616 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%70, %18, %72) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, 
memref<512x256x1x1xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %73 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%72, %arg88, %arg89, %73) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %74 = "byre.alias"(%alloc) <{offset = 696320 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%70, %19, %74) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %75 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%74, %arg78, %arg79, %75) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %76 = "byre.alias"(%alloc) <{offset = 897024 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %77 = "byre.alias"(%alloc) <{offset = 6457088 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%75, %76, %77) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%76, %20, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %78 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%75, %arg83, %arg84, %78) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %79 = "byre.alias"(%alloc) <{offset = 1097728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %80 = "byre.alias"(%alloc) <{offset = 4820992 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%78, %73, %79, %80) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, 
memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %81 = "byre.alias"(%alloc) <{offset = 1298432 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%79, %21, %81) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%81, %arg93, %arg94, %78) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %82 = "byre.alias"(%alloc) <{offset = 1499136 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %83 = "byre.alias"(%alloc) <{offset = 6356736 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%78, %82, %83) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %84 = "byre.alias"(%alloc) <{offset = 72593152 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%82, %22, %84) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%84, %arg98, %arg99, %73) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %85 = "byre.alias"(%alloc) <{offset = 72793856 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%73, %79, %78, %85) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %86 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<2048x49xf16, "cuda"> + %87 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<2048xf16, "cuda"> + byre.compute @PTXOp(%86, %87) {BlockSize.x = 64 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 2048 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown62_kernel"} : memref<2048x49xf16, "cuda">, memref<2048xf16, "cuda"> + %88 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : 
(memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + %89 = "byre.alias"(%alloc) <{offset = 5435520 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + byre.compute @PTXOp(%88, %89) {BlockSize.x = 256 : i32, GridSize.x = 2 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown63", memory_effects = [1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512xf16, "cuda"> + %90 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%89, %24, %90) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x1000xf16, "cuda"> + %91 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%25, %90, %91) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %92 = "byre.alias"(%alloc) <{offset = 11529856 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%91, %92) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown65_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + byre.compute @PTXOp(%92, %91, %90) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown66", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %93 = "byre.alias"(%alloc) <{offset = 47111808 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%90, %93) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown67_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %94 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%93, %94) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown68", memory_effects = [1 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4xf16, "cuda"> + %95 = "byre.alias"(%alloc) <{offset = 5447680 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + %96 = "byre.alias"(%alloc) <{offset = 5455744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%94, %90, %26, %23, %95, %96) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %97 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%96, %24, %97) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 
: i64} : memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x512xf16, "cuda"> + %98 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%97, %85, %98) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown70", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%84, %arg98, %98, %78, %arg163, %arg164) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%78, %22, %73) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%82, %78, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%83, %73, %78) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%81, %arg93, %78, %73, %arg160, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%73, %21, %78) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%79, %73, %21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%98, %78, %80, %84) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown78", memory_effects = [1 : i32, 1 : i32, 1 : 
i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%75, %arg83, %84, %73, %arg154, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%73, %20, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%76, %73, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%77, %75, %73) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%74, %arg78, %73, %75, %arg151, %arg152) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%75, %19, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%70, %75, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%72, %arg88, %84, %98, %arg157, %arg158) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%98, %18, %63) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %99 = "byre.alias"(%alloc) <{offset = 1499136 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%70, %98, %99) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> + %100 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%63, %58, %71, %100) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%69, %arg73, %100, %63, %arg148, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%63, %17, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %101 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%67, %63, %101) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%68, %58, %63) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown93", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%66, %arg68, %63, %58, %arg145, %arg146) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%58, %16, %63) {batch_group_count = 1 : i64, device = "cuda", 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %102 = "byre.alias"(%alloc) <{offset = 71790336 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%64, %58, %102) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + %103 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%100, %63, %65, %103) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> + %104 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%60, %arg58, %103, %104, %arg139, %arg140) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%104, %15, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%61, %104, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%62, %58, %60) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown93", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%59, %arg53, %60, %58, %arg136, %arg137) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%58, 
%14, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%44, %58, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%57, %arg63, %103, %104, %arg142, %arg143) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %105 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%104, %13, %105) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %106 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%44, %104, %106) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%105, %46, %56, %44) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown108", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%55, %arg48, %44, %46, %arg133, %arg134) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + %107 = "byre.alias"(%alloc) <{offset = 11120384 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %12, %107) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, 
window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %108 = "byre.alias"(%alloc) {offset = 68846656 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %108 = "byre.alias"(%alloc) <{offset = 56536832 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%53, %46, %108) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %109 = "byre.alias"(%alloc) {offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%54, %107, %109) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown106", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%52, %arg43, %109, %46, %arg130, %arg131) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %11, %107) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %110 = "byre.alias"(%alloc) {offset = 69141568 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%50, %46, %110) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%56, %107, %51, %109) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown110", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%49, %arg33, %109, %46, %arg124, %arg125) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %10, %107) {batch_group_count = 1 : i64, device = "cuda", 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %111 = "byre.alias"(%alloc) {offset = 72860736 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%54, %107, %46) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown112", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%52, %arg43, %46, %107, %arg130, %arg131) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %11, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %109 = "byre.alias"(%alloc) <{offset = 61353728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%50, %107, %109) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> + %110 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%44, %46, %51, %110) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown108", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%49, %arg33, %110, %46, %arg124, %arg125) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %10, %44) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %111 = "byre.alias"(%alloc) <{offset = 55734016 : 
i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%47, %46, %111) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%48, %107, %46) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown114", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%45, %arg28, %46, %107, %arg121, %arg122) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %9, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %112 = "byre.alias"(%alloc) {offset = 73155648 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%41, %107, %112) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%43, %arg38, %109, %107, %arg127, %arg128) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %113 = "byre.alias"(%alloc) {offset = 47785024 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %8, %113) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %114 = "byre.alias"(%alloc) {offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%41, %107, %114) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> - byre.compute @PTXOp(%113, %37, %42, %41) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown121", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> - %115 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%40, %arg23, %41, %115, %arg118, %arg119) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%115, %7, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %116 = "byre.alias"(%alloc) {offset = 9217088 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%38, %115, %116) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%39, %37, %115) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown125", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%36, %arg18, %115, %37, %arg115, %arg116) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%37, %6, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %117 = "byre.alias"(%alloc) {offset = 12428352 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%30, %37, %117) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 
: i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%41, %115, %35, %36) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown129", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%33, %arg13, %36, %30, %arg112, %arg113) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%30, %5, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %118 = "byre.alias"(%alloc) {offset = 15639616 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31, %30, %118) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%32, %115, %30) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown133", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%29, %arg8, %30, %115, %arg109, %arg110) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%115, %4, %30) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %119 = "byre.alias"(%alloc) {offset = 20030528 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%28, %115, %119) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : 
tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%36, %30, %115) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown137", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @PoolMaxGradOp_f16f16_f16(%26, %115, %3) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - %120 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> - byre.compute @PTXOp(%27, %3, %120) {BlockSize.x = 128 : i32, GridSize.x = 25088 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown138", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%2, %arg3, %120, %26, %arg106, %arg107) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %121 = "byre.alias"(%alloc) {offset = 50996288 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %26, %121) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> - %122 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref - byre.compute @ReduceSumOp_f32_f32(%95, %122) {device = "cuda", dimensions = dense<[0, 1]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref - byre.compute @PTXOp(%122, %arg104) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [0 : i32, 0 : i32], kernel_name = "Unknown141", memory_effects = [1 : i32, 2 : i32]} : memref, memref - byre.compute @PTXOp(%121, %arg105) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown142", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> - byre.compute @PTXOp(%119, %arg108) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown143", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%118, %arg111) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown144", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%117, %arg114) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], 
kernel_name = "Unknown145", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%116, %arg117) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown146", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%112, %arg120) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown147", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%111, %arg123) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown148", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%114, %arg126) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown149", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> - byre.compute @PTXOp(%110, %arg129) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%108, %arg132) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown151", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%14, %arg135) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown152", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%17, %arg138) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown153", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%106, %arg141) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown154", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> - byre.compute @PTXOp(%16, %arg144) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%104, %arg147) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown156", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%19, %arg150) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown157", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%21, %arg153) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown158", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%101, %arg156) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown159", memory_effects = [1 
: i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> - byre.compute @PTXOp(%99, %arg159) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%22, %arg162) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown161", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - %123 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%88, %94, %123) {device = "cuda", lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%123, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown163", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> - %124 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000xf32, "cuda"> - byre.compute @ReduceSumOp_f32_f32(%96, %124) {device = "cuda", dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<1000xf32, "cuda"> - byre.compute @PTXOp(%124, %arg166) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown164", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda"> + byre.compute @PTXOp(%48, %44, %46) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown112", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %112 = "byre.alias"(%alloc) <{offset = 9514752 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%45, %arg28, %46, %112, %arg121, %arg122) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%112, %9, %41) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %113 = "byre.alias"(%alloc) <{offset = 56028928 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37, %112, %113) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : 
memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%43, %arg38, %110, %46, %arg127, %arg128) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + %114 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %8, %114) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %115 = "byre.alias"(%alloc) <{offset = 62156544 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37, %46, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> + %116 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%114, %41, %42, %116) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown127", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%40, %arg23, %116, %37, %arg118, %arg119) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%37, %7, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %117 = "byre.alias"(%alloc) <{offset = 10317568 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%38, %37, %117) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%39, %40, %37) {BlockSize.x 
= 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown131", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%31, %arg18, %37, %38, %arg115, %arg116) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%38, %6, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %118 = "byre.alias"(%alloc) <{offset = 11923200 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%35, %38, %118) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%116, %31, %36, %35) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown127", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%34, %arg13, %35, %31, %arg112, %arg113) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%31, %5, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %119 = "byre.alias"(%alloc) <{offset = 13528832 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%32, %31, %119) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%33, %34, %31) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown131", 
memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%30, %arg8, %31, %34, %arg109, %arg110) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%34, %4, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %120 = "byre.alias"(%alloc) <{offset = 19951360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29, %34, %120) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%35, %31, %34) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown143", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @PoolMaxGradOp_f16f16_f16(%27, %34, %3) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%28, %3, %27) {BlockSize.x = 256 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown144", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%2, %arg3, %27, %3, %arg106, %arg107) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %3, %1) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> + %121 = "byre.alias"(%alloc) <{offset = 62978176 : i64}> : (memref<76533504xi8, "cuda">) -> memref<f32, "cuda"> + %122 = "byre.alias"(%alloc) <{offset = 5447680 : i64}> :
(memref<76533504xi8, "cuda">) -> memref<32x125xf16, "cuda"> + %123 = "byre.alias"(%arg1) <{offset = 0 : i64}> : (memref<4x1000xf32, "cuda">) -> memref<32x125xf32, "cuda"> + %124 = "byre.alias"(%alloc) <{offset = 49311488 : i64}> : (memref<76533504xi8, "cuda">) -> memref<32xf32, "cuda"> + byre.compute @PTXOp(%122, %123, %124) {BlockSize.x = 128 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 32 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown147_kernel"} : memref<32x125xf16, "cuda">, memref<32x125xf32, "cuda">, memref<32xf32, "cuda"> + byre.compute @PTXOp(%124, %121) {BlockSize.x = 32 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 1 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown147_kernel_0"} : memref<32xf32, "cuda">, memref<f32, "cuda"> + byre.compute @PTXOp(%121, %arg104) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [0 : i32, 0 : i32], kernel_name = "Unknown148", memory_effects = [1 : i32, 2 : i32]} : memref<f32, "cuda">, memref<f32, "cuda"> + byre.compute @PTXOp(%1, %arg105) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown149", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> + byre.compute @PTXOp(%120, %arg108) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%119, %arg111) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%118, %arg114) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%117, %arg117) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%113, %arg120) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown154", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%111, %arg123) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%115, %arg126) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown156", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> + byre.compute @PTXOp(%109, %arg129) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%108, %arg132) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute
@PTXOp(%14, %arg135) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown159", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%15, %arg138) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%106, %arg141) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown161", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> + byre.compute @PTXOp(%102, %arg144) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%101, %arg147) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%19, %arg150) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown164", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%20, %arg153) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%99, %arg156) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown166", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> + byre.compute @PTXOp(%21, %arg159) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%22, %arg162) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + %125 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%89, %96, %125) {device = "cuda", lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%125, %arg165) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown170", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> + %126 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<1000xf32, "cuda"> + byre.compute @PTXOp(%96, %126) {BlockSize.x = 32 : i32, BlockSize.y = 2 : i32, BlockSize.z = 1 : i32, GridSize.x = 32 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown171_kernel"} : memref<4x1000xf16, "cuda">, memref<1000xf32, "cuda"> + 
byre.compute @PTXOp(%126, %arg166) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown172", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda"> return } -} \ No newline at end of file +} + diff --git a/compiler/test/E2E/ResNet18/Whole/host_output.mlir b/compiler/test/E2E/ResNet18/Whole/host_output.mlir index db4102fa0..0845dd8cc 100644 --- a/compiler/test/E2E/ResNet18/Whole/host_output.mlir +++ b/compiler/test/E2E/ResNet18/Whole/host_output.mlir @@ -4,327 +4,332 @@ module @IrToMhlo.2452 attributes {byre.container_module, gpu.container_module} { func.func @main(%arg0: memref<4x3x224x224xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<4x1000xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<64xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<64xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<64xf32, "cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<64xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<64xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<64xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<64xf32, "cuda"> {byre.argname = "Input15", byre.argtype = 1 : i32}, %arg16: memref<64xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<64xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<64xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<64xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<64xf32, "cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<64xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<64xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<64xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<64xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<128xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<128xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<128xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, %arg31: memref<128xf32, "cuda"> {byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<128xf32, "cuda"> {byre.argname = "Input33", byre.argtype = 1 : i32}, %arg34: memref<128xf32, 
"cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<128xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<128xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Input37", byre.argtype = 1 : i32}, %arg38: memref<128xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<128xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<128xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<128xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<128xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<128xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<128xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<128xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<128xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<128xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<128xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<128xf32, "cuda"> {byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<256xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<256xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<256xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<256xf32, "cuda"> {byre.argname = "Input56", byre.argtype = 1 : i32}, %arg57: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<256xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<256xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<256xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<256xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<256xf32, "cuda"> {byre.argname = "Input63", byre.argtype = 1 : i32}, %arg64: memref<256xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref<256xf32, "cuda"> {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<256xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref<256xf32, "cuda"> {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<256xf32, "cuda"> {byre.argname = "Input69", byre.argtype = 1 : i32}, %arg70: memref<256xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref<256xf32, "cuda"> {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<256xf32, "cuda"> {byre.argname = "Input73", byre.argtype = 1 : i32}, %arg74: memref<256xf32, "cuda"> {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<256xf32, "cuda"> {byre.argname = "Input75", byre.argtype 
= 1 : i32}, %arg76: memref<256xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<512xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: memref<512xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref<512xf32, "cuda"> {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<512xf32, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref<512xf32, "cuda"> {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<512xf32, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<512xf32, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref<512xf32, "cuda"> {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<512xf32, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref<512xf32, "cuda"> {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<512xf32, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<512xf32, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<512xf32, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<512xf32, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref<512xf32, "cuda"> {byre.argname = "Input95", byre.argtype = 1 : i32}, %arg96: memref<512xf32, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref<512xf32, "cuda"> {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<512xf32, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<512xf32, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref<512xf32, "cuda"> {byre.argname = "Input101", byre.argtype = 1 : i32}, %arg102: memref<1000x512xf32, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<1000xf32, "cuda"> {byre.argname = "Input103", byre.argtype = 1 : i32}, %arg104: memref {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg105: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}, %arg106: memref<64xf32, "cuda"> {byre.argname = "Output2", byre.argtype = 2 : i32}, %arg107: memref<64xf32, "cuda"> {byre.argname = "Output3", byre.argtype = 2 : i32}, %arg108: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output4", byre.argtype = 2 : i32}, %arg109: memref<64xf32, "cuda"> {byre.argname = "Output5", byre.argtype = 2 : i32}, %arg110: memref<64xf32, "cuda"> {byre.argname = "Output6", byre.argtype = 2 : i32}, %arg111: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output7", byre.argtype = 2 : i32}, %arg112: memref<64xf32, "cuda"> {byre.argname = "Output8", byre.argtype = 2 : i32}, %arg113: memref<64xf32, "cuda"> {byre.argname = "Output9", byre.argtype = 2 : i32}, %arg114: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output10", byre.argtype = 2 : i32}, %arg115: memref<64xf32, "cuda"> {byre.argname = "Output11", byre.argtype = 2 : i32}, %arg116: memref<64xf32, "cuda"> {byre.argname = "Output12", byre.argtype = 2 : i32}, %arg117: memref<64x64x3x3xf32, 
"cuda"> {byre.argname = "Output13", byre.argtype = 2 : i32}, %arg118: memref<64xf32, "cuda"> {byre.argname = "Output14", byre.argtype = 2 : i32}, %arg119: memref<64xf32, "cuda"> {byre.argname = "Output15", byre.argtype = 2 : i32}, %arg120: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Output16", byre.argtype = 2 : i32}, %arg121: memref<128xf32, "cuda"> {byre.argname = "Output17", byre.argtype = 2 : i32}, %arg122: memref<128xf32, "cuda"> {byre.argname = "Output18", byre.argtype = 2 : i32}, %arg123: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output19", byre.argtype = 2 : i32}, %arg124: memref<128xf32, "cuda"> {byre.argname = "Output20", byre.argtype = 2 : i32}, %arg125: memref<128xf32, "cuda"> {byre.argname = "Output21", byre.argtype = 2 : i32}, %arg126: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Output22", byre.argtype = 2 : i32}, %arg127: memref<128xf32, "cuda"> {byre.argname = "Output23", byre.argtype = 2 : i32}, %arg128: memref<128xf32, "cuda"> {byre.argname = "Output24", byre.argtype = 2 : i32}, %arg129: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output25", byre.argtype = 2 : i32}, %arg130: memref<128xf32, "cuda"> {byre.argname = "Output26", byre.argtype = 2 : i32}, %arg131: memref<128xf32, "cuda"> {byre.argname = "Output27", byre.argtype = 2 : i32}, %arg132: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output28", byre.argtype = 2 : i32}, %arg133: memref<128xf32, "cuda"> {byre.argname = "Output29", byre.argtype = 2 : i32}, %arg134: memref<128xf32, "cuda"> {byre.argname = "Output30", byre.argtype = 2 : i32}, %arg135: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Output31", byre.argtype = 2 : i32}, %arg136: memref<256xf32, "cuda"> {byre.argname = "Output32", byre.argtype = 2 : i32}, %arg137: memref<256xf32, "cuda"> {byre.argname = "Output33", byre.argtype = 2 : i32}, %arg138: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output34", byre.argtype = 2 : i32}, %arg139: memref<256xf32, "cuda"> {byre.argname = "Output35", byre.argtype = 2 : i32}, %arg140: memref<256xf32, "cuda"> {byre.argname = "Output36", byre.argtype = 2 : i32}, %arg141: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Output37", byre.argtype = 2 : i32}, %arg142: memref<256xf32, "cuda"> {byre.argname = "Output38", byre.argtype = 2 : i32}, %arg143: memref<256xf32, "cuda"> {byre.argname = "Output39", byre.argtype = 2 : i32}, %arg144: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output40", byre.argtype = 2 : i32}, %arg145: memref<256xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : i32}, %arg146: memref<256xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg147: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg148: memref<256xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg149: memref<256xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, %arg150: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg151: memref<512xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg152: memref<512xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg153: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : i32}, %arg154: memref<512xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg155: memref<512xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg156: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg157: memref<512xf32, "cuda"> 
{byre.argname = "Output53", byre.argtype = 2 : i32}, %arg158: memref<512xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg159: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg160: memref<512xf32, "cuda"> {byre.argname = "Output56", byre.argtype = 2 : i32}, %arg161: memref<512xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg162: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg163: memref<512xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg164: memref<512xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg165: memref<1000x512xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}, %arg166: memref<1000xf32, "cuda"> {byre.argname = "Output62", byre.argtype = 2 : i32}) attributes {byre.entry_point, device_file_name = "your_file"} { - %alloc = memref.alloc() : memref<76022848xi8, "cuda"> - %0 = "byre.alias"(%alloc) {device = "cuda", offset = 8012864 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x3x224x224xf16, "cuda"> - byre.compute @PTXOp(%arg0, %0) {BlockSize.x = 128 : i32, GridSize.x = 4704 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<4x3x224x224xf32, "cuda">, memref<4x3x224x224xf16, "cuda"> - %1 = "byre.alias"(%alloc) {device = "cuda", offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg2, %1) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> - %2 = "byre.alias"(%alloc) {device = "cuda", offset = 50996288 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + %alloc = memref.alloc() : memref<76533504xi8, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 75329280 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x3x224x224xf16, "cuda"> + byre.compute @PTXOp(%arg0, %0) {BlockSize.x = 256 : i32, GridSize.x = 588 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<4x3x224x224xf32, "cuda">, memref<4x3x224x224xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg2, %1) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 49311488 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%0, %1, %2) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<64x3x7x7xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - %3 = "byre.alias"(%alloc) {device = "cuda", offset = 44573760 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + %3 = 
"byre.alias"(%alloc) <{offset = 42888960 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%2, %arg3, %arg4, %3) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda"> - %4 = "byre.alias"(%alloc) {device = "cuda", offset = 5080128 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg7, %4) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %5 = "byre.alias"(%alloc) {device = "cuda", offset = 5006400 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg12, %5) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %6 = "byre.alias"(%alloc) {device = "cuda", offset = 1552384 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg17, %6) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown5", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %7 = "byre.alias"(%alloc) {device = "cuda", offset = 5153856 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg22, %7) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %8 = "byre.alias"(%alloc) {device = "cuda", offset = 4247104 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg37, %8) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> - %9 = "byre.alias"(%alloc) {device = "cuda", offset = 602112 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg27, %9) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown8", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> - %10 = "byre.alias"(%alloc) {device = "cuda", offset = 2383872 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg32, %10) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %11 = "byre.alias"(%alloc) {device = "cuda", offset = 2088960 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg42, %11) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown10", memory_effects = [1 : i32, 2 : 
i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %12 = "byre.alias"(%alloc) {device = "cuda", offset = 2678784 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg47, %12) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown11", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %13 = "byre.alias"(%alloc) {device = "cuda", offset = 4940864 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg62, %13) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> - %14 = "byre.alias"(%alloc) {device = "cuda", offset = 60228672 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg52, %14) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> - %15 = "byre.alias"(%alloc) {device = "cuda", offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg57, %15) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %16 = "byre.alias"(%alloc) {device = "cuda", offset = 18850880 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg67, %16) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown15", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %17 = "byre.alias"(%alloc) {device = "cuda", offset = 6833216 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg72, %17) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %18 = "byre.alias"(%alloc) {device = "cuda", offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg87, %18) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown17", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> - %19 = "byre.alias"(%alloc) {device = "cuda", offset = 21636160 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg77, %19) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> - %20 = "byre.alias"(%alloc) {device = "cuda", offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg82, %20) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : 
i32, 4 : i32], device = "cuda", kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - %21 = "byre.alias"(%alloc) {device = "cuda", offset = 33432640 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg92, %21) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - %22 = "byre.alias"(%alloc) {device = "cuda", offset = 28714048 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg97, %22) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - %23 = "byre.alias"(%alloc) {device = "cuda", offset = 749568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%arg1, %23) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown22", memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda"> - %24 = "byre.alias"(%alloc) {device = "cuda", offset = 23995456 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%arg102, %24) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown23", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> - %25 = "byre.alias"(%alloc) {device = "cuda", offset = 757568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%23, %25) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %26 = "byre.alias"(%alloc) {device = "cuda", offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> - %27 = "byre.alias"(%alloc) {device = "cuda", offset = 59827264 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xi1, "cuda"> - byre.compute @PTXOp(%3, %26, %27) {BlockSize.x = 128 : i32, GridSize.x = 25088 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda"> - %28 = "byre.alias"(%alloc) {device = "cuda", offset = 5227584 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @PoolMaxOp_f16_f16(%26, %28) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %29 = "byre.alias"(%alloc) {device = "cuda", offset = 20030528 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%28, %4, %29) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = 
"NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %30 = "byre.alias"(%alloc) {device = "cuda", offset = 44573760 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%29, %arg8, %arg9, %30) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %31 = "byre.alias"(%alloc) {device = "cuda", offset = 17245248 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %32 = "byre.alias"(%alloc) {device = "cuda", offset = 301056 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%30, %31, %32) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %33 = "byre.alias"(%alloc) {device = "cuda", offset = 15639616 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%31, %5, %33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %34 = "byre.alias"(%alloc) {device = "cuda", offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%33, %arg13, %arg14, %34) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %35 = "byre.alias"(%alloc) {device = "cuda", offset = 501760 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%34, %28, %30, %35) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown28", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %36 = "byre.alias"(%alloc) {device = "cuda", offset = 14033984 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%30, %6, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, 
"cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %37 = "byre.alias"(%alloc) {device = "cuda", offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%36, %arg18, %arg19, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %38 = "byre.alias"(%alloc) {device = "cuda", offset = 12428352 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %39 = "byre.alias"(%alloc) {device = "cuda", offset = 200704 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%37, %38, %39) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %40 = "byre.alias"(%alloc) {device = "cuda", offset = 9217088 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 5545728 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg7, %4) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 5361664 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg12, %5) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 6283008 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg17, %6) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 6209280 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg22, %7) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 5463808 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg37, %8) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 6557440 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg27, %9) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown8", memory_effects = [1 : 
i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 2256896 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg32, %10) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 1761280 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg42, %11) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 0 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg47, %12) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 5480192 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg62, %13) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 5619456 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg52, %14) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 74149632 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg57, %15) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg67, %16) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg72, %17) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %18 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg87, %18) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, 
arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown17", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 23162624 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg77, %19) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> + %20 = "byre.alias"(%alloc) <{offset = 28733184 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg82, %20) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 33451776 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg92, %21) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 38170368 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg97, %22) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 5439616 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%arg1, %23) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown22", memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda"> + %24 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%arg102, %24) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown23", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 73993984 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<1000xf16, "cuda"> + byre.compute @PTXOp(%arg103, %25) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf16, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 5435392 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%23, %26) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown25_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + %28 = 
"byre.alias"(%alloc) <{offset = 25521920 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xi1, "cuda"> + byre.compute @PTXOp(%3, %27, %28) {BlockSize.x = 256 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 15134464 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @PoolMaxOp_f16_f16(%27, %29) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 16740096 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%29, %4, %30) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 42888960 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%30, %arg8, %arg9, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 19951360 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 69381888 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%31, %32, %33) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown28", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 7106304 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%32, %5, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%34, %arg13, %arg14, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 
: i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 18345728 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 70987520 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%31, %29, %35, %36) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown30", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%35, %6, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 44494592 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%31, %arg18, %arg19, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 13528832 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 57339648 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%37, %38, %39) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown28", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 11923200 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%38, %7, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%40, %arg23, %arg24, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %41 = "byre.alias"(%alloc) {device = "cuda", offset = 10822720 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %42 = "byre.alias"(%alloc) {device = "cuda", offset = 401408 : i64} : (memref<76022848xi8, "cuda">) -> 
memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%37, %30, %41, %42) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown32", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %43 = "byre.alias"(%alloc) {device = "cuda", offset = 61621312 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%41, %8, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %44 = "byre.alias"(%alloc) {device = "cuda", offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 10317568 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%40, %arg23, %arg24, %41) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %42 = "byre.alias"(%alloc) <{offset = 70184704 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%41, %35, %37, %42) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown30", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 58142464 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%37, %8, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 10317568 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%43, %arg38, %arg39, %44) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %45 = "byre.alias"(%alloc) {device = "cuda", offset = 70452288 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%41, %9, %45) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %46 = "byre.alias"(%alloc) {device = "cuda", offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 59748096 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%37, %9, %45) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 46100224 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%45, %arg28, %arg29, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %47 = "byre.alias"(%alloc) {device = "cuda", offset = 71255104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %48 = "byre.alias"(%alloc) {device = "cuda", offset = 4740160 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %47, %48) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %49 = "byre.alias"(%alloc) {device = "cuda", offset = 72057920 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %47 = "byre.alias"(%alloc) <{offset = 60550912 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %48 = "byre.alias"(%alloc) <{offset = 3756032 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %47, %48) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown37", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %49 = "byre.alias"(%alloc) <{offset = 62156544 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%47, %10, %49) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, 
memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%49, %arg33, %arg34, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %50 = "byre.alias"(%alloc) {device = "cuda", offset = 69649472 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %51 = "byre.alias"(%alloc) {device = "cuda", offset = 4790336 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %44, %50, %51) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %52 = "byre.alias"(%alloc) {device = "cuda", offset = 60818496 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %50 = "byre.alias"(%alloc) <{offset = 55734016 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %51 = "byre.alias"(%alloc) <{offset = 3354624 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %44, %50, %51) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown39", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %52 = "byre.alias"(%alloc) <{offset = 61353728 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%50, %11, %52) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%52, %arg43, %arg44, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %53 = "byre.alias"(%alloc) {device = "cuda", offset = 57418816 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %54 = "byre.alias"(%alloc) {device = "cuda", offset = 4840512 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %53, %54) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown39", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %55 = "byre.alias"(%alloc) {device = "cuda", offset = 68846656 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %53 = "byre.alias"(%alloc) <{offset = 58945280 : i64}> 
{device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %54 = "byre.alias"(%alloc) <{offset = 2953216 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %53, %54) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown37", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %55 = "byre.alias"(%alloc) <{offset = 56536832 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%53, %12, %55) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%55, %arg48, %arg49, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %56 = "byre.alias"(%alloc) {device = "cuda", offset = 72860736 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %57 = "byre.alias"(%alloc) {device = "cuda", offset = 4890688 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %50, %56, %57) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown41", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %58 = "byre.alias"(%alloc) {device = "cuda", offset = 2973696 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%56, %13, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %59 = "byre.alias"(%alloc) {device = "cuda", offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%58, %arg63, %arg64, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %60 = "byre.alias"(%alloc) {device = "cuda", offset = 59024448 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%56, %14, %60) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %61 = "byre.alias"(%alloc) {device = "cuda", offset = 46580800 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%60, %arg53, %arg54, %61) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %62 = "byre.alias"(%alloc) {device = "cuda", offset = 58623040 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %63 = "byre.alias"(%alloc) {device = "cuda", offset = 4263488 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%61, %62, %63) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %64 = "byre.alias"(%alloc) {device = "cuda", offset = 58221632 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%62, %15, %64) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%64, %arg58, %arg59, %61) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %65 = "byre.alias"(%alloc) {device = "cuda", offset = 4338752 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %66 = "byre.alias"(%alloc) {device = "cuda", offset = 4288576 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%61, %59, %65, %66) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown46", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %67 = "byre.alias"(%alloc) {device = "cuda", offset = 3776512 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%65, %16, %67) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = 
dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%67, %arg68, %arg69, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %68 = "byre.alias"(%alloc) {device = "cuda", offset = 3375104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %69 = "byre.alias"(%alloc) {device = "cuda", offset = 4313664 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%59, %68, %69) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %70 = "byre.alias"(%alloc) {device = "cuda", offset = 74843200 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%68, %17, %70) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%70, %arg73, %arg74, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %71 = "byre.alias"(%alloc) {device = "cuda", offset = 75244608 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %72 = "byre.alias"(%alloc) {device = "cuda", offset = 4177920 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%59, %65, %71, %72) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown50", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %73 = "byre.alias"(%alloc) {device = "cuda", offset = 950272 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%71, %18, %73) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %74 = "byre.alias"(%alloc) {device = "cuda", offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%73, %arg88, %arg89, %74) {device = "cuda", epsilon = 9.99999974E-6 
: f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %75 = "byre.alias"(%alloc) {device = "cuda", offset = 1150976 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%71, %19, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %76 = "byre.alias"(%alloc) {device = "cuda", offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%75, %arg78, %arg79, %76) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %77 = "byre.alias"(%alloc) {device = "cuda", offset = 1351680 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %78 = "byre.alias"(%alloc) {device = "cuda", offset = 59688000 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%76, %77, %78) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown53", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %79 = "byre.alias"(%alloc) {device = "cuda", offset = 0 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%77, %20, %79) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%79, %arg83, %arg84, %76) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %80 = "byre.alias"(%alloc) {device = "cuda", offset = 1626112 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %81 = "byre.alias"(%alloc) {device = "cuda", offset = 59700544 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%76, %74, %80, %81) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown55", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - byre.compute @ConvOp_f16f16_f16(%80, %21, %76) {batch_group_count = 1 : i64, device = 
"cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %82 = "byre.alias"(%alloc) {device = "cuda", offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%76, %arg93, %arg94, %82) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %83 = "byre.alias"(%alloc) {device = "cuda", offset = 1826816 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %84 = "byre.alias"(%alloc) {device = "cuda", offset = 59713088 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%82, %83, %84) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown57", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %85 = "byre.alias"(%alloc) {device = "cuda", offset = 75646016 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%83, %22, %85) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%85, %arg98, %arg99, %74) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %86 = "byre.alias"(%alloc) {device = "cuda", offset = 75846720 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%74, %80, %82, %86) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown59", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %87 = "byre.alias"(%alloc) {device = "cuda", offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%82, %87) {device = "cuda", dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512xf16, "cuda"> - %88 = "byre.alias"(%alloc) {device = "cuda", offset = 4203008 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @PTXOp(%87, %88) {BlockSize.x = 128 : i32, GridSize.x = 16 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown60", memory_effects = 
[1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512xf16, "cuda"> - %89 = "byre.alias"(%alloc) {device = "cuda", offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%88, %24, %89) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x1000xf16, "cuda"> - %90 = "byre.alias"(%alloc) {device = "cuda", offset = 25019456 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%arg103, %89, %90) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> - %91 = "byre.alias"(%alloc) {device = "cuda", offset = 25027456 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceMaxOp_f16_f16(%90, %91) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %92 = "byre.alias"(%alloc) {device = "cuda", offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%91, %90, %92, %89) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> - %93 = "byre.alias"(%alloc) {device = "cuda", offset = 42877824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%89, %93) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %94 = "byre.alias"(%alloc) {device = "cuda", offset = 4207104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - %95 = "byre.alias"(%alloc) {device = "cuda", offset = 4215104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf32, "cuda"> - %96 = "byre.alias"(%alloc) {device = "cuda", offset = 4231104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf32, "cuda"> - byre.compute @PTXOp(%93, %92, %25, %23, %arg1, %94, %95, %96) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown63", memory_effects = [1 : i32, 1 : i32, 1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">, memref<4x1000xf32, "cuda"> - %97 = "byre.alias"(%alloc) {device = "cuda", offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%94, %24, %97) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x512xf16, "cuda"> - %98 = "byre.alias"(%alloc) {device = "cuda", offset = 749568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute 
@PTXOp(%97, %86, %98) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%85, %arg98, %98, %82, %arg163, %arg164) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%82, %22, %74) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%83, %82, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%84, %74, %82) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown68", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%76, %arg93, %82, %74, %arg160, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%74, %21, %76) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %99 = "byre.alias"(%alloc) {device = "cuda", offset = 23995456 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%80, %74, %99) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%98, %76, %81, %83) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, 
"cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%79, %arg83, %83, %76, %arg154, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%76, %20, %85) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%77, %76, %21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%78, %85, %76) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown76", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %100 = "byre.alias"(%alloc) {device = "cuda", offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%75, %arg78, %76, %100, %arg151, %arg152) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%100, %19, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%71, %100, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%73, %arg88, %83, %100, %arg157, %arg158) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute 
@ConvBackwardDataOp_f16f16_f16(%100, %18, %61) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %101 = "byre.alias"(%alloc) {device = "cuda", offset = 1826816 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%71, %100, %101) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> - %102 = "byre.alias"(%alloc) {device = "cuda", offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%61, %59, %72, %102) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown83", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%70, %arg73, %102, %59, %arg148, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %103 = "byre.alias"(%alloc) {device = "cuda", offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%59, %17, %103) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %104 = "byre.alias"(%alloc) {device = "cuda", offset = 74843200 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%68, %59, %104) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%69, %103, %59) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown87", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%67, %arg68, %59, %103, %arg145, %arg146) {device = "cuda", epsilon = 9.99999974E-6 : f32, 
feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %16, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%65, %103, %16) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%102, %59, %66, %65) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown91", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%64, %arg58, %65, %59, %arg139, %arg140) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%59, %15, %103) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%62, %59, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%63, %103, %59) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown95", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%60, %arg53, %59, %103, %arg136, %arg137) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - 
byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %14, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%56, %103, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%58, %arg63, %65, %103, %arg142, %arg143) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %105 = "byre.alias"(%alloc) {device = "cuda", offset = 46982208 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %13, %105) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %106 = "byre.alias"(%alloc) {device = "cuda", offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%56, %103, %106) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> - byre.compute @PTXOp(%105, %46, %57, %56) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown102", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%55, %arg48, %56, %46, %arg133, %arg134) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %107 = "byre.alias"(%alloc) {device = "cuda", offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %56 = "byre.alias"(%alloc) <{offset = 4960256 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %50, %44, 
%56) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown39", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %57 = "byre.alias"(%alloc) <{offset = 11120384 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%44, %13, %57) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %58 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%57, %arg63, %arg64, %58) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %59 = "byre.alias"(%alloc) <{offset = 46100224 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%44, %14, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %60 = "byre.alias"(%alloc) <{offset = 46501632 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%59, %arg53, %arg54, %60) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %61 = "byre.alias"(%alloc) <{offset = 2551808 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %62 = "byre.alias"(%alloc) <{offset = 6704896 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%60, %61, %62) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%61, %15, %60) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : 
tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %63 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%60, %arg58, %arg59, %63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %64 = "byre.alias"(%alloc) <{offset = 4157440 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %65 = "byre.alias"(%alloc) <{offset = 6905600 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %58, %64, %65) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown48", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %66 = "byre.alias"(%alloc) <{offset = 22736640 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%64, %16, %66) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%66, %arg68, %arg69, %63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %67 = "byre.alias"(%alloc) <{offset = 71790336 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %68 = "byre.alias"(%alloc) <{offset = 2056192 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %67, %68) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %69 = "byre.alias"(%alloc) <{offset = 9891584 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%67, %17, %69) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%69, %arg73, %arg74, %63) {device = "cuda", epsilon = 
9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %70 = "byre.alias"(%alloc) <{offset = 72191744 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %71 = "byre.alias"(%alloc) <{offset = 294912 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %64, %70, %71) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown48", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %72 = "byre.alias"(%alloc) <{offset = 495616 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%70, %18, %72) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %73 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%72, %arg88, %arg89, %73) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %74 = "byre.alias"(%alloc) <{offset = 696320 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%70, %19, %74) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %75 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%74, %arg78, %arg79, %75) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %76 = "byre.alias"(%alloc) <{offset = 897024 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %77 = "byre.alias"(%alloc) <{offset = 6457088 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%75, %76, %77) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : 
i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%76, %20, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %78 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%75, %arg83, %arg84, %78) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %79 = "byre.alias"(%alloc) <{offset = 1097728 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %80 = "byre.alias"(%alloc) <{offset = 4820992 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%78, %73, %79, %80) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %81 = "byre.alias"(%alloc) <{offset = 1298432 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%79, %21, %81) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%81, %arg93, %arg94, %78) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %82 = "byre.alias"(%alloc) <{offset = 1499136 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %83 = "byre.alias"(%alloc) <{offset = 6356736 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%78, %82, %83) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %84 = "byre.alias"(%alloc) <{offset = 72593152 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%82, %22, %84) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%84, %arg98, %arg99, %73) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %85 = "byre.alias"(%alloc) <{offset = 72793856 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%73, %79, %78, %85) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %86 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<2048x49xf16, "cuda"> + %87 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<2048xf16, "cuda"> + byre.compute @PTXOp(%86, %87) {BlockSize.x = 64 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 2048 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown62_kernel"} : memref<2048x49xf16, "cuda">, memref<2048xf16, "cuda"> + %88 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + %89 = "byre.alias"(%alloc) <{offset = 5435520 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + byre.compute @PTXOp(%88, %89) {BlockSize.x = 256 : i32, GridSize.x = 2 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown63", memory_effects = [1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512xf16, "cuda"> + %90 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%89, %24, %90) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x1000xf16, "cuda"> + %91 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%25, %90, %91) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %92 = "byre.alias"(%alloc) <{offset = 11529856 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%91, %92) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown65_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + byre.compute @PTXOp(%92, %91, %90) {BlockSize.x = 256 
: i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown66", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %93 = "byre.alias"(%alloc) <{offset = 47111808 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%90, %93) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown67_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %94 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%93, %94) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown68", memory_effects = [1 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4xf16, "cuda"> + %95 = "byre.alias"(%alloc) <{offset = 5447680 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + %96 = "byre.alias"(%alloc) <{offset = 5455744 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%94, %90, %26, %23, %95, %96) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %97 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%96, %24, %97) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x512xf16, "cuda"> + %98 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%97, %85, %98) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown70", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%84, %arg98, %98, %78, %arg163, %arg164) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%78, %22, %73) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%82, %78, %22) {batch_group_count = 1 : i64, device = "cuda", 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%83, %73, %78) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%81, %arg93, %78, %73, %arg160, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%73, %21, %78) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%79, %73, %21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%98, %78, %80, %84) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown78", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%75, %arg83, %84, %73, %arg154, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%73, %20, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%76, %73, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + 
byre.compute @PTXOp(%77, %75, %73) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%74, %arg78, %73, %75, %arg151, %arg152) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%75, %19, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%70, %75, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%72, %arg88, %84, %98, %arg157, %arg158) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%98, %18, %63) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %99 = "byre.alias"(%alloc) <{offset = 1499136 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%70, %98, %99) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> + %100 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%63, %58, %71, %100) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute 
@BatchNormGradOp_f16f32f16_f16f32f32(%69, %arg73, %100, %63, %arg148, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%63, %17, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %101 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%67, %63, %101) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%68, %58, %63) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown93", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%66, %arg68, %63, %58, %arg145, %arg146) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%58, %16, %63) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %102 = "byre.alias"(%alloc) <{offset = 71790336 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%64, %58, %102) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + %103 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%100, %63, %65, %103) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : 
memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> + %104 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%60, %arg58, %103, %104, %arg139, %arg140) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%104, %15, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%61, %104, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%62, %58, %60) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown93", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%59, %arg53, %60, %58, %arg136, %arg137) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%58, %14, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%44, %58, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%57, %arg63, %103, %104, %arg142, %arg143) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, 
"cuda">, memref<256xf32, "cuda"> + %105 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%104, %13, %105) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %106 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%44, %104, %106) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%105, %46, %56, %44) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown108", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%55, %arg48, %44, %46, %arg133, %arg134) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + %107 = "byre.alias"(%alloc) <{offset = 11120384 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %12, %107) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %108 = "byre.alias"(%alloc) {device = "cuda", offset = 68846656 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %108 = "byre.alias"(%alloc) <{offset = 56536832 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%53, %46, %108) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %109 = "byre.alias"(%alloc) {device = "cuda", offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%54, %107, %109) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], 
device = "cuda", kernel_name = "Unknown106", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%52, %arg43, %109, %46, %arg130, %arg131) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %11, %107) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %110 = "byre.alias"(%alloc) {device = "cuda", offset = 69141568 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%50, %46, %110) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%56, %107, %51, %109) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown110", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%49, %arg33, %109, %46, %arg124, %arg125) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %10, %107) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %111 = "byre.alias"(%alloc) {device = "cuda", offset = 72860736 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%54, %107, %46) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown112", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%52, %arg43, %46, %107, %arg130, %arg131) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : 
memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %11, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %109 = "byre.alias"(%alloc) <{offset = 61353728 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%50, %107, %109) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> + %110 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%44, %46, %51, %110) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown108", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%49, %arg33, %110, %46, %arg124, %arg125) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %10, %44) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %111 = "byre.alias"(%alloc) <{offset = 55734016 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%47, %46, %111) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%48, %107, %46) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown114", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%45, %arg28, %46, %107, %arg121, %arg122) 
{device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %9, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %112 = "byre.alias"(%alloc) {device = "cuda", offset = 73155648 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%41, %107, %112) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%43, %arg38, %109, %107, %arg127, %arg128) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %113 = "byre.alias"(%alloc) {device = "cuda", offset = 47785024 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %8, %113) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %114 = "byre.alias"(%alloc) {device = "cuda", offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%41, %107, %114) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> - byre.compute @PTXOp(%113, %37, %42, %41) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown121", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> - %115 = "byre.alias"(%alloc) {device = "cuda", offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%40, %arg23, %41, %115, %arg118, %arg119) {device = "cuda", epsilon = 9.99999974E-6 : f32, 
feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%115, %7, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %116 = "byre.alias"(%alloc) {device = "cuda", offset = 9217088 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%38, %115, %116) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%39, %37, %115) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown125", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%36, %arg18, %115, %37, %arg115, %arg116) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%37, %6, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %117 = "byre.alias"(%alloc) {device = "cuda", offset = 12428352 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%30, %37, %117) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%41, %115, %35, %36) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown129", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%33, %arg13, %36, %30, %arg112, %arg113) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects 
= [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%30, %5, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %118 = "byre.alias"(%alloc) {device = "cuda", offset = 15639616 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31, %30, %118) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%32, %115, %30) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown133", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%29, %arg8, %30, %115, %arg109, %arg110) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%115, %4, %30) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %119 = "byre.alias"(%alloc) {device = "cuda", offset = 20030528 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%28, %115, %119) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%36, %30, %115) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown137", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @PoolMaxGradOp_f16f16_f16(%26, %115, %3) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 
2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - %120 = "byre.alias"(%alloc) {device = "cuda", offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> - byre.compute @PTXOp(%27, %3, %120) {BlockSize.x = 128 : i32, GridSize.x = 25088 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown138", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%2, %arg3, %120, %26, %arg106, %arg107) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %121 = "byre.alias"(%alloc) {device = "cuda", offset = 50996288 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %26, %121) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> - %122 = "byre.alias"(%alloc) {device = "cuda", offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref - byre.compute @ReduceSumOp_f32_f32(%95, %122) {device = "cuda", dimensions = dense<[0, 1]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref - byre.compute @PTXOp(%122, %arg104) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [0 : i32, 0 : i32], device = "cuda", kernel_name = "Unknown141", memory_effects = [1 : i32, 2 : i32]} : memref, memref - byre.compute @PTXOp(%121, %arg105) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown142", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> - byre.compute @PTXOp(%119, %arg108) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown143", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%118, %arg111) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown144", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%117, %arg114) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown145", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%116, %arg117) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown146", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%112, %arg120) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name 
= "Unknown147", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%111, %arg123) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown148", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%114, %arg126) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown149", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> - byre.compute @PTXOp(%110, %arg129) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%108, %arg132) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown151", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%14, %arg135) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown152", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%17, %arg138) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown153", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%106, %arg141) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown154", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> - byre.compute @PTXOp(%16, %arg144) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%104, %arg147) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown156", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%19, %arg150) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown157", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%21, %arg153) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown158", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%101, %arg156) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown159", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> - byre.compute @PTXOp(%99, %arg159) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : 
memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%22, %arg162) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown161", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - %123 = "byre.alias"(%alloc) {device = "cuda", offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%88, %94, %123) {device = "cuda", lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%123, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown163", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> - %124 = "byre.alias"(%alloc) {device = "cuda", offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000xf32, "cuda"> - byre.compute @ReduceSumOp_f32_f32(%96, %124) {device = "cuda", dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<1000xf32, "cuda"> - byre.compute @PTXOp(%124, %arg166) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown164", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda"> + byre.compute @PTXOp(%48, %44, %46) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown112", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %112 = "byre.alias"(%alloc) <{offset = 9514752 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%45, %arg28, %46, %112, %arg121, %arg122) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%112, %9, %41) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %113 = "byre.alias"(%alloc) <{offset = 56028928 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37, %112, %113) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%43, 
%arg38, %110, %46, %arg127, %arg128) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">
+    %114 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda">
+    byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %8, %114) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x64x56x56xf16, "cuda">
+    %115 = "byre.alias"(%alloc) <{offset = 62156544 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda">
+    byre.compute @ConvBackwardFilterOp_f16f16_f16(%37, %46, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">
+    %116 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda">
+    byre.compute @PTXOp(%114, %41, %42, %116) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown127", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">
+    byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%40, %arg23, %116, %37, %arg118, %arg119) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">
+    byre.compute @ConvBackwardDataOp_f16f16_f16(%37, %7, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda">
+    %117 = "byre.alias"(%alloc) <{offset = 10317568 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda">
+    byre.compute @ConvBackwardFilterOp_f16f16_f16(%38, %37, %117) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">
+    byre.compute @PTXOp(%39, %40, %37) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown131", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">
+    byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%31, %arg18, %37, %38, %arg115, %arg116) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">
+    byre.compute @ConvBackwardDataOp_f16f16_f16(%38, %6, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda">
+    %118 = "byre.alias"(%alloc) <{offset = 11923200 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda">
+    byre.compute @ConvBackwardFilterOp_f16f16_f16(%35, %38, %118) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">
+    byre.compute @PTXOp(%116, %31, %36, %35) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown127", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">
+    byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%34, %arg13, %35, %31, %arg112, %arg113) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">
+    byre.compute @ConvBackwardDataOp_f16f16_f16(%31, %5, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda">
+    %119 = "byre.alias"(%alloc) <{offset = 13528832 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda">
+    byre.compute @ConvBackwardFilterOp_f16f16_f16(%32, %31, %119) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">
+    byre.compute @PTXOp(%33, %34, %31) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown131", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">
+    byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%30, %arg8, %31, %34, %arg109, %arg110) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">
+    byre.compute @ConvBackwardDataOp_f16f16_f16(%34, %4, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda">
+    %120 = "byre.alias"(%alloc) <{offset = 19951360 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda">
+    byre.compute @ConvBackwardFilterOp_f16f16_f16(%29, %34, %120) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">
+    byre.compute @PTXOp(%35, %31, %34) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown143", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">
+    byre.compute @PoolMaxGradOp_f16f16_f16(%27, %34, %3) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x112x112xf16, "cuda">
+    byre.compute @PTXOp(%28, %3, %27) {BlockSize.x = 256 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown144", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">
+    byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%2, %arg3, %27, %3, %arg106, %arg107) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">
+    byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %3, %1) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda">
+    %121 = "byre.alias"(%alloc) <{offset = 62978176 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref
+    %122 = "byre.alias"(%alloc) <{offset = 5447680 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<32x125xf16, "cuda">
+    %123 = "byre.alias"(%arg1) <{offset = 0 : i64}> {device = "cuda"} : (memref<4x1000xf32, "cuda">) -> memref<32x125xf32, "cuda">
+    %124 = "byre.alias"(%alloc) <{offset = 49311488 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<32xf32, "cuda">
+    byre.compute @PTXOp(%122, %123, %124) {BlockSize.x = 128 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 32 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown147_kernel"} : memref<32x125xf16, "cuda">, memref<32x125xf32, "cuda">, memref<32xf32, "cuda">
+    byre.compute @PTXOp(%124, %121) {BlockSize.x = 32 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 1 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown147_kernel_0"} : memref<32xf32, "cuda">, memref
+    byre.compute @PTXOp(%121, %arg104) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [0 : i32, 0 : i32], device = "cuda", kernel_name = "Unknown148", memory_effects = [1 : i32, 2 : i32]} : memref, memref
+    byre.compute @PTXOp(%1, %arg105) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown149", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda">
+    byre.compute @PTXOp(%120, %arg108) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda">
+    byre.compute @PTXOp(%119, %arg111) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda">
+    byre.compute @PTXOp(%118, %arg114) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda">
+    byre.compute @PTXOp(%117, %arg117) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda">
+    byre.compute @PTXOp(%113, %arg120) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown154", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda">
+    byre.compute @PTXOp(%111, %arg123) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda">
+    byre.compute @PTXOp(%115, %arg126) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown156", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda">
+    byre.compute @PTXOp(%109, %arg129) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda">
+    byre.compute @PTXOp(%108, %arg132) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda">
+    byre.compute @PTXOp(%14, %arg135) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown159", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda">
+    byre.compute @PTXOp(%15, %arg138) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda">
+    byre.compute @PTXOp(%106, %arg141) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown161", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda">
+    byre.compute @PTXOp(%102, %arg144) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda">
+    byre.compute @PTXOp(%101, %arg147) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda">
+    byre.compute @PTXOp(%19, %arg150) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown164", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda">
+    byre.compute @PTXOp(%20, %arg153) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda">
+    byre.compute @PTXOp(%99, %arg156) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown166", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda">
+    byre.compute @PTXOp(%21, %arg159) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda">
+    byre.compute @PTXOp(%22, %arg162) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda">
+    %125 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<1000x512xf16, "cuda">
+    byre.compute @MatmulOp_f16f16_f16(%89, %96, %125) {device = "cuda", lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">
+    byre.compute @PTXOp(%125, %arg165) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown170", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda">
+    %126 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<1000xf32, "cuda">
+    byre.compute @PTXOp(%96, %126) {BlockSize.x = 32 : i32, BlockSize.y = 2 : i32, BlockSize.z = 1 : i32, GridSize.x = 32 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown171_kernel"} : memref<4x1000xf16, "cuda">, memref<1000xf32, "cuda">
+    byre.compute @PTXOp(%126, %arg166) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown172", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda">
+    return
+  }
+}
\ No newline at end of file